Web Analytics
Privacy Policy · Cookie Policy · Terms and Conditions · Wikipédia:Utilitaires/xml-to-file.py - Wikipédia

Wikipédia:Utilitaires/xml-to-file.py

Un article de Wikipédia, l'encyclopédie libre.

Vous avez de nouveaux messages (diff ?).

import sys
import os
import re
import locale
import htmlentitydefs

class Siteinfo:
    """Site settings extracted from the <siteinfo> header of a MediaWiki dump.

    Attributes:
        first_letter_case: False when the header's <case> element reads
            'first-letter', True otherwise.
            NOTE(review): the polarity looks inverted relative to the
            attribute name, and no visible code reads it -- confirm before
            relying on it.
        namespace: dict mapping each namespace prefix, trailing colon
            included (e.g. 'Discussion:'), to its integer key.
    """

    # The greedy leading '.*' selects the last <case> element of the header.
    _case_re = re.compile(r'(?s).*<case>(.*?)</case>')
    # '[^>]*' tolerates extra attributes on the tag (for instance
    # case="first-letter").  '\d+' never matches a negative key, and a
    # self-closing <namespace ... /> has no closing tag to match, so such
    # entries are not recorded.
    _namespace_re = re.compile(
        r'<namespace key="(\d+)"[^>]*>(.*)</namespace>')

    def __init__(self, text):
        """Parse ``text``, the dump header up to and including </siteinfo>."""
        case = None
        found = self._case_re.match(text)
        if found:
            case = found.group(1)
        self.first_letter_case = True
        if case == 'first-letter':
            self.first_letter_case = False
        self.namespace = {}
        # FIXME: this assumes namespace 0 is unnamed
        for found in self._namespace_re.finditer(text):
            self.namespace[found.group(2) + ':'] = int(found.group(1))

def read_header():
    """Read standard input up to the end of the dump's <siteinfo> block.

    Returns:
        The concatenation of every line consumed, up to and including the
        line that ends with '</siteinfo>'.  If the stream ends before such a
        line is seen, everything read so far is returned (possibly '')
        instead of None, so the caller always receives a string.
    """
    lines = []
    for line in sys.stdin:
        lines.append(line)
        # rstrip() makes the test independent of the line terminator
        # ('\n', '\r\n') and of trailing blanks.
        if line.rstrip().endswith('</siteinfo>'):
            break
    return ''.join(lines)

def build_regex_from_list(lst):
    """Return a regex group that matches any one of the given alternatives.

    Args:
        lst: iterable of pattern fragments.  They are inserted verbatim
            (not escaped), so each must already be a valid regex.

    Returns:
        A string of the form '(a|b|c)'.  An empty input yields '()' -- a
        valid pattern -- rather than a dangling ')'.
    """
    return '(' + '|'.join(lst) + ')'

def capitalize(s):
    """Upper-case the first character of a UTF-8 encoded byte string.

    The string is decoded first so that a multi-byte first character
    (e.g. an accented letter) is upper-cased as a whole.

    Args:
        s: UTF-8 encoded byte string (a Python 2 ``str``).

    Returns:
        The UTF-8 encoded result; an empty input is returned unchanged
        instead of raising IndexError.
    """
    if not s:
        return s
    # For a byte string this is the same conversion as unicode(s, 'utf-8').
    text = s.decode('utf-8')
    text = text[0].upper() + text[1:]
    return text.encode('utf-8', 'replace')

def replace_html_entity(matchobj):
    """Regex substitution callback: turn a named entity into its character.

    Args:
        matchobj: match whose group 1 holds the bare entity name
            (for example 'eacute').

    Returns:
        The one-character unicode string for that entity.

    Raises:
        KeyError: if the name is missing from
            ``htmlentitydefs.name2codepoint``.
    """
    entity_name = matchobj.group(1)
    codepoint = htmlentitydefs.name2codepoint[entity_name]
    return unichr(codepoint)

# Alternation group listing every named HTML entity known to htmlentitydefs.
html_entities = build_regex_from_list(htmlentitydefs.entitydefs.keys())
# Matches one complete named entity reference; group 1 is the bare name.
html_entity = re.compile(u''.join([u'&', html_entities, u';']))
def replace_entity(text):
    """Decode the named HTML entities found in a UTF-8 byte string.

    Only named references (``&name;``) are translated.  The substitution
    for numeric references (``&#NNN;`` / ``&#xHH;``) is disabled in this
    version, so those are left as they are.

    Args:
        text: UTF-8 encoded byte string.

    Returns:
        The text, re-encoded as UTF-8, with each named entity replaced by
        its character.
    """
    decoded = unicode(text, 'utf-8')
    translated = html_entity.sub(replace_html_entity, decoded)
    return translated.encode('utf-8', 'replace')

def split_title(title, siteinfo):
    """Split a wiki title into an on-disk (namespace directory, file name).

    Args:
        title: page title or link target as a UTF-8 byte string.  One
            leading ':' is dropped and everything from the first '#' on
            (the anchor) is ignored.
        siteinfo: object whose ``namespace`` dict is keyed by namespace
            prefixes including their trailing colon.

    Returns:
        A ``(namespace, title)`` tuple.  ``namespace`` is the recognised
        prefix with spaces turned into underscores, or 'Article' when the
        text before the first ':' is not a known prefix.  ``title`` has its
        surrounding spaces removed, its first character upper-cased, spaces
        replaced by '_', '/' replaced by '%2F' and named HTML entities
        decoded.  ``('', '')`` is returned when no title text remains.
    """
    if title.startswith(':'):
        title = title[1:]
    prefix = title.split(':')[0]
    # capitalize() is only called on non-empty text: titles such as '' or
    # ':' would otherwise make it index an empty string.
    if prefix:
        prefix = capitalize(prefix)
    if prefix + ':' in siteinfo.namespace:
        title = title[len(prefix) + 1:]
        namespace = prefix.replace(' ', '_')
    else:
        namespace = 'Article'
    # Drop the anchor and trim *before* capitalizing, so that ' foo' becomes
    # 'Foo' and an anchor-only target ('#section') ends up empty.
    title = title.split('#')[0].strip(' ')
    if not title:
        return ('', '')
    title = capitalize(title)
    title = title.replace(' ', '_')
    # '/' cannot appear in a file name.
    title = title.replace('/', '%2F')
    title = replace_entity(title)
    return (namespace, title)

def out_text(fd, text):
    """Write ``text`` to ``fd`` after decoding its named HTML entities.

    The Python 2 print statement is used with a trailing comma, so no
    newline is appended.  Note that print keeps "soft space" state on the
    file: when the text written here does not end in a newline, the next
    print to the same file is preceded by a single space.

    Args:
        fd: destination file object.
        text: UTF-8 encoded byte string.
    """
    decoded = replace_entity(text)
    print >> fd, decoded,

def create_symlink(last_namespace, last_title, namespace, title):
    """Replace a redirect page's file by a symlink to the redirect target.

    Args:
        last_namespace: namespace directory of the redirecting page.
        last_title: file name of the redirecting page.
        namespace: namespace directory of the target page.
        title: file name of the target page.

    Nothing is done when ``title`` is empty or when the page redirects to
    itself.  Otherwise 'fr/<last_namespace>/<last_title>' is removed and
    recreated as a relative symlink: just ``title`` inside the same
    namespace, '../<namespace>/<title>' across namespaces.
    """
    if len(title) < 1:
        return
    # ignore self-redirect.
    if last_namespace == namespace and last_title == title:
        return
    link_path = 'fr/' + last_namespace + '/' + last_title
    os.remove(link_path)
    if last_namespace == namespace:
        target = title
    else:
        target = '../' + namespace + '/' + title
    os.symlink(target, link_path)

# Captures the page title found between <title> and </title> on a line.
title_text = re.compile(r'<title>(.*)</title>')
# Captures whatever follows the opening <text> tag up to the end of the line.
start_text = re.compile(r'.*<text xml:space="preserve">(.*)')
# Captures whatever precedes a closing </text> tag on a line.
end_text = re.compile(r'(.*)</text>')
# Recognises a redirect directive (case-insensitive) and captures the
# target written between the double square brackets.
redirect_text = re.compile(
    r'\s*#\s*REDIRECT[^]]*\[\[([^]]*)\]\].*',
    re.IGNORECASE)
def parse_xml(f, siteinfo):
    """Stream a MediaWiki XML dump and write one file per page under 'fr/'.

    The dump is scanned line by line with regular expressions (this is
    Python 2 code and not a real XML parser).  Every <title> opens
    'fr/<namespace>/<title>'; the content of the following <text> element
    is written to it through out_text().  When the text starts with a
    redirect directive, the file is replaced by a symlink to the target
    instead (see create_symlink()).

    Args:
        f: iterable yielding the lines of the dump that follow the
            <siteinfo> header.
        siteinfo: namespace information handed to split_title().
    """
    in_article = False
    count = 0
    # No page seen yet: a <text> line that precedes the first <title> is
    # ignored instead of raising NameError on these names.
    last_namespace = ''
    last_title = ''
    fd = None
    for line in f:
        match = title_text.search(line)
        if match:
            count += 1
            if count % 256 == 0:
                # Progress counter, overwritten in place thanks to '\r'.
                print >> sys.stderr, str(count) + '\r',
            if fd is not None:
                fd.close()
            title = match.group(1)
            (last_namespace, last_title) = split_title(title, siteinfo)
            if last_title != '':
                fd = open('fr/' + last_namespace + '/' + last_title, 'w')
            else:
                fd = None
        match = start_text.search(line)
        # 'fd is not None' skips a <text> element whose page file is already
        # finished; printing to a None file would go to standard output.
        if match and last_title != '' and fd is not None:
            text = match.group(1)
            in_article = True
            # match(), not search(): the directive only counts when it opens
            # the text, not when it is merely quoted later on the line.
            match = redirect_text.match(text)
            if match:
                (namespace, title) = split_title(match.group(1), siteinfo)
                # Close the placeholder before it is swapped for a symlink.
                fd.close()
                fd = None
                create_symlink(last_namespace, last_title, namespace, title)
                in_article = False
                continue
            match2 = end_text.search(text)
            if match2:
                text = match2.group(1)
                in_article = False
            else:
                # start_text's group stops before the line terminator; put
                # it back so the first line is not glued to the second one
                # (print would otherwise join them with a soft space).
                text += '\n'
            out_text(fd, text)
            if match2:
                fd.close()
                fd = None
        else:
            match = end_text.search(line)
            if match and in_article:
                out_text(fd, match.group(1))
                in_article = False
                fd.close()
                fd = None
            elif in_article:
                out_text(fd, line)
    if fd is not None:
        fd.close()


def extract_files():
    """Convert the dump read from standard input into a file tree in 'fr/'.

    Steps: select the 'fr_FR.utf8' locale for character classification and
    collation, read the <siteinfo> header, create 'fr/' with one
    sub-directory per namespace (spaces replaced by underscores) plus
    'Article', then hand the rest of the stream to parse_xml().

    os.mkdir() raises OSError if any of these directories already exists.
    """
    for category in (locale.LC_CTYPE, locale.LC_COLLATE):
        locale.setlocale(category, 'fr_FR.utf8')
    siteinfo = Siteinfo(read_header())
    os.mkdir('fr')
    for prefix in siteinfo.namespace:
        # Keys carry a trailing ':' that is not part of the directory name.
        dirname = prefix[:-1].replace(' ', '_')
        os.mkdir('fr' + '/' + dirname)
    os.mkdir('fr' + '/' + 'Article')
    parse_xml(sys.stdin, siteinfo)


# Script entry point: run the extraction only when executed directly.
if __name__ == "__main__":
    extract_files()
    # Disabled profiling harness, kept for reference: it would run
    # extract_files() under the profiler and print the statistics.
    #import profile
    #prof = profile.Profile()
    #try:
    #    prof.run('extract_files()')
    #finally:
    #     prof.print_stats()

THIS WEB:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia 2006:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu