Web Analytics
Privacy Policy Cookie Policy Terms and Conditions Wikipédia:Utilitaires/extract SQL.py - Wikipédia

Wikipédia:Utilitaires/extract SQL.py

Un article de Wikipédia, l'encyclopédie libre.

Vous avez de nouveaux messages (diff ?).

#!/usr/bin/python

# name: utf2lat1.py

import codecs
import sys

# Codec callables used by encode(); initialise_codecs() may rebind both.
# Default direction: UTF-8 input, ISO-8859-1 output.
utf82uni = codecs.getdecoder('utf-8')
uni2lat1 = codecs.getencoder('iso-8859-1')

# Set to 1 by initialise_codecs() when the input and output charsets are
# the same; encode() then returns its argument untouched.
nil_encode_decode = 0

# Output (encoder) and input (decoder) charset names, keyed by the wiki
# language code.
encoder_names = {'fr': 'iso-8859-1', 'en': 'iso-8859-1'}
decoder_names = {'fr': 'utf-8', 'en': 'iso-8859-1'}

def initialise_codecs(lang):
    """Select the decoder/encoder pair that encode() uses for *lang*.

    lang is looked up in decoder_names / encoder_names; an unknown
    language falls back to UTF-8 input and ISO-8859-1 output.

    When both charsets are identical, only the pass-through flag
    nil_encode_decode is set and the codec callables are left alone.
    Otherwise the chosen pair is announced on stdout and installed in
    the module globals utf82uni and uni2lat1.
    """
    global nil_encode_decode, uni2lat1, utf82uni

    encoder_name = encoder_names.get(lang, 'iso-8859-1')
    decoder_name = decoder_names.get(lang, 'utf-8')
    if decoder_name == encoder_name:
        nil_encode_decode = 1
        return

    # Clear the pass-through flag: an earlier call for a language that
    # needed no conversion must not keep encode() disabled.
    nil_encode_decode = 0
    sys.stdout.write('Input lang: "%s", encoder: "%s", decoder "%s"\n'
                     % (lang, encoder_name, decoder_name))
    uni2lat1 = codecs.getencoder(encoder_name)
    utf82uni = codecs.getdecoder(decoder_name)

def encode(str, file):
    """Transcode *str* from the input charset to the output charset.

    str  -- byte string in the charset handled by utf82uni.
    file -- label used only in the error message (callers pass a path
            or a description of what is being converted).

    Returns str unchanged when nil_encode_decode is 1.  Otherwise the
    text is decoded with utf82uni and encoded with uni2lat1; characters
    the output charset cannot represent become numeric character
    references of the form "&#NNN;".

    Raises UnicodeError (after a message on stderr) when str is not
    valid in the input charset.
    """
    if nil_encode_decode == 1:
        return str

    try:
        text = utf82uni(str)[0]
    except UnicodeError:
        sys.stderr.write('Invalid %s: "%s"\n' % (utf82uni.__name__, file))
        raise

    # The 'xmlcharrefreplace' error handler emits "&#NNN;" for every
    # character the encoder rejects, whatever the selected encoder is.
    ## FIXME: use html entities if possible
    return uni2lat1(text, 'xmlcharrefreplace')[0]

if __name__ == '__main__':
    # Smoke test of encode().  The sample is the UTF-8 byte sequence for
    # "e-acute, space, e-grave", written with escapes so that this source
    # stays pure ASCII (it carries no encoding declaration).
    tst = '\xc3\xa9 \xc3\xa8'
    # encode() requires a label as second argument for its diagnostics.
    res = encode(tst, 'self-test')
    sys.stderr.write('"%s" -> "%s"\n' % (tst, res))


#!/usr/bin/python

# name: extract SQL.py

import re
import sys
import utf2lat1

# Set to 1 by the "-big" command-line option; mkfilename() and
# mklnkname() then place namespace-0 files in per-initial sub-directories.
big = 0

# 0 until extract1() has initialised the utf2lat1 codecs (or "-utf8" was
# given on the command line), 1 afterwards.
init_codec = 0

# Redirects collected by out(): link path -> link target.
symlink = {}


# Boundary between two rows of a multi-row INSERT: a quote not preceded
# by a backslash, followed by "),(".
p1 = re.compile(r"([^\\])'\),\(")

# Captures whatever precedes "wiki" in the database name of the dump's
# "-- Host: ... Database: ..." header line.
re_match_lang = re.compile("-- Host:.*Database: (.*)wiki")

def extract1(line):
    """Split one dump line into the raw rows of an "INSERT INTO cur".

    As a side effect, the first line matching re_match_lang initialises
    the utf2lat1 codecs with the captured language and sets init_codec.

    Returns a list with one string per row (outer parentheses removed),
    or an empty list when the line is not an INSERT into "cur" or holds
    no row at all.
    """
    global init_codec

    # Text from the dump can contain something matching the regexp, so
    # initialise only once to avoid that pitfall.
    if init_codec == 0:
        match = re_match_lang.search(line)
        if match is not None:
            utf2lat1.initialise_codecs(match.group(1))
            init_codec = 1

    prefix = "INSERT INTO cur VALUES "
    if not line.startswith(prefix):
        return []

    # Peel off the statement terminator and the outermost parentheses
    # one by one rather than by fixed offsets, so that CRLF line endings
    # or a missing final newline do not shift the slice.
    body = line[len(prefix):].rstrip()
    if body.endswith(';'):
        body = body[:-1]
    if body.startswith('('):
        body = body[1:]
    if body.endswith(')'):
        body = body[:-1]
    if not body:
        return []

    # Turn every row boundary into a newline, then split on it.
    return p1.sub(r"\1'\n", body).split("\n")

# Separator between two adjacent quoted columns of a row.
p2 = re.compile("','")

def extract2(line):
    """Pick the namespace, title, text and timestamp out of one raw row.

    line is one element of the list returned by extract1(): the
    comma-separated columns of a row, without the outer parentheses.

    Returns the tuple (namespace, title, text, timestamp).  The title is
    unescaped; the text is returned exactly as it appears in the row.
    """
    # Leading numeric columns: head[0] = id, head[1] = namespace,
    # head[2] = everything else.
    head = line.split(",", 2)

    # Drop the opening quote of the title, then cut at the first five
    # "','" separators:
    #   cols[0] = title
    #   cols[1] = article text
    #   cols[2] = summary + user_id + user_name
    #   cols[3] = timestamp
    #   cols[4] = ...
    quoted = head[2][1:]
    cols = p2.split(quoted, 5)

    # NOTE(review): eval() is used to undo the backslash escapes of the
    # title, which means text coming from the dump is evaluated as a
    # Python expression.  Only run this on dumps from a trusted source.
    title = eval("'" + cols[0] + "'")

    return (head[1], title, cols[1], cols[3])


# Output directory of each namespace, keyed by the namespace number as it
# appears (as a string) in the dump.  The position of a directory in the
# list below is its namespace number: '0' -> 'article', ... '15' ->
# 'category_talk'.
ns_name = dict(zip(map(str, range(16)), [
    'article',
    'talk',
    'user',
    'user_talk',
    'wikipedia',
    'wikipedia_talk',
    'image',
    'image_talk',
    'msg',
    'msg_talk',
    'template',
    'template_talk',
    'help',
    'help_talk',
    'category',
    'category_talk',
]))

def set_file_time(pathname, ts):
    """Set the access and modification times of *pathname* from *ts*.

    pathname -- file whose times are updated.
    ts       -- string whose first 14 characters are the digits
                YYYYMMDDhhmmss.

    The fields are converted with time.mktime(), i.e. read as local time
    with the DST flag forced to 0.  Nothing is raised for a malformed
    timestamp or a failing os.utime(): the timestamp and the parsed
    fields (None when parsing failed) are reported on stderr instead.
    """
    import os
    import time

    fields = None
    try:
        # Parse the timestamp string.
        year = int(ts[0:4])
        month = int(ts[4:6])
        day = int(ts[6:8])
        hour = int(ts[8:10])
        minute = int(ts[10:12])
        second = int(ts[12:14])

        # Last three entries: weekday, day of year, DST flag.
        fields = (year, month, day, hour, minute, second, 0, 0, 0)
        stamp = time.mktime(fields)
        os.utime(pathname, (stamp, stamp))
    except (ValueError, TypeError, OverflowError, OSError):
        sys.stderr.write("%s -> %s\n" % (ts, fields))

def mkfilename(ns, name):
    """Return the file name of page *name*, relative to its namespace dir.

    Every '/' of the name is replaced by '%25' so the result never
    contains a path separator of its own.  Outside namespace '0', or
    when the "-big" layout is off (big == 0), that is the whole result.

    With the "-big" layout, namespace-0 names are prefixed with a bucket
    directory: the upper-cased first character when it is one of A-Z or
    0-9, and '-' for anything else, including an empty name.
    """
    name = name.replace('/', '%25')
    if ns != '0' or big == 0:
        return name

    # Accept only the buckets that the "-big" set-up creates (A-Z, 0-9
    # and '-'); name[:1] keeps an empty name from raising IndexError.
    initial = name[:1].upper()
    if initial and initial in "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789":
        return initial + '/' + name
    return '-/' + name

def mklnkname(ns, name):
    """Return the symlink target to use for a redirect to page *name*.

    This is mkfilename(ns, name); for namespace '0' under the "-big"
    layout it is prefixed with '../', because the link itself then lives
    inside a bucket sub-directory.
    """
    target = mkfilename(ns, name)
    if ns == '0' and big != 0:
        return '../' + target
    return target
        
def out(ns, name, data, ts):
    """Write one page to disk, or record it as a redirect.

    ns   -- namespace number as a string (a key of ns_name).
    name -- page title in the dump's charset.
    data -- page text, already unescaped, in the dump's charset.
    ts   -- timestamp string handed to set_file_time().

    A page whose text starts with "#REDIRECT [[target]]" is not written:
    the pair (link path, link target) is stored in symlink instead.  Any
    other page is transcoded with utf2lat1.encode(), written under the
    directory of its namespace, and its file times are set from ts.

    Failures are reported on stderr and the page is skipped; nothing is
    raised.
    """
    try:
        name = utf2lat1.encode(name, 'filename: ' + name)
    except Exception:
        sys.stderr.write("illegal namefile: '%s'\n" % (name))
        return

    # A namespace without an output directory must not abort the run.
    if ns not in ns_name:
        sys.stderr.write("unknown namespace '%s' for '%s'\n" % (ns, name))
        return
    pathname = ns_name[ns] + '/' + mkfilename(ns, name)

    # Anchored at the start of the text (re.match): a page that merely
    # mentions "#REDIRECT [[...]]" further down is a regular page.
    match = re.match(r'\s*#\s*REDIRECT\s*\[\[([^\]]*)\]\]', data, re.I)
    if match:     ## found a redirect
        dest = match.group(1).replace(' ', '_')
        if len(dest) >= 1:
            # The target is transcoded like file names are, so that the
            # link points at the name the target file really gets.
            try:
                dest = utf2lat1.encode(dest, 'redirect: ' + dest)
            except Exception:
                sys.stderr.write("illegal redirect: '%s'\n" % (dest))
                return
            dest = dest[0].upper() + dest[1:]
            symlink[pathname] = mklnkname(ns, dest)
        return

    try:
        # Transcode before opening, so a failed conversion does not
        # leave an empty file behind.
        payload = utf2lat1.encode(data, pathname)
        f = open(pathname, "w")
        try:
            f.write(payload)
        finally:
            f.close()
        set_file_time(pathname, ts)
    except Exception:
        sys.stderr.write("can't write '%s'\n" % (pathname))


import os
import shutil

# Start from an empty output directory for every namespace.  Whatever
# already exists under that name in the current directory is deleted.
for directory in ns_name.values():
    sys.stdout.write("create dir %s\n" % (directory))
    if os.path.isdir(directory) and not os.path.islink(directory):
        shutil.rmtree(directory)
    elif os.path.islink(directory) or os.path.exists(directory):
        os.remove(directory)
    os.mkdir(directory)

# "-big": spread namespace-0 pages over one sub-directory per initial.
if '-big' in sys.argv[1:]:
    big = 1
    for bucket in "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-":
        os.mkdir(ns_name['0'] + '/' + bucket)

# "-utf8": mark the codecs as already initialised, so extract1() never
# calls utf2lat1.initialise_codecs() and the default pair stays in use.
if '-utf8' in sys.argv[1:]:
    init_codec = 1

# One backslash escape of the page text: either the pair "\r\n" written
# as escapes, or a backslash followed by any single character.
_sql_escape = re.compile(r'\\(r\\n|.)', re.S)

def _unescape_sql(match):
    """Return the replacement text for one escape found by _sql_escape."""
    seq = match.group(1)
    if seq == 'n' or seq == 'r\\n':
        return '\n'
    if seq in ("'", '"', '\\'):
        return seq
    # Any other escape is left exactly as it was found.
    return match.group(0)

# Read the dump from stdin and write out every row of every INSERT.
for line in sys.stdin:
    for row in extract1(line):
        ns, title, txt, ts = extract2(row)

        # To restrict the extraction to some namespaces, test ns here
        # and "continue" for the unwanted ones.

        # All escapes are resolved in a single left-to-right pass, so an
        # escaped backslash followed by "n" (e.g. TeX "\nabla") yields a
        # backslash and an "n", not a line break.
        out(ns, title, _sql_escape.sub(_unescape_sql, txt), ts)


import os

# Create the links for the redirects recorded by out(), after the
# extraction loop has finished.  A link that cannot be created is
# reported on stderr and skipped; "except Exception" keeps that
# best-effort behaviour without also swallowing an interrupt.
for pathname, dest in symlink.items():
    try:
        os.symlink(dest, pathname)
    except Exception:
        sys.stderr.write('Erreur: REDIRECT "%s" to "%s"\n' % (pathname, dest))



#!/usr/bin/python

# name: extract_SQL_UTF8.py

import re
import sys

# Set to 1 by the "-big" command-line option; mkfilename() and
# mklnkname() then place namespace-0 files in per-initial sub-directories.
big = 0

# Redirects collected by out(): link path -> link target.
symlink = {}


# Boundary between two rows of a multi-row INSERT: a quote not preceded
# by a backslash, followed by "),(".
p1 = re.compile(r"([^\\])'\),\(")

def extract1(line):
    """Split one dump line into the raw rows of an "INSERT INTO cur".

    Returns a list with one string per row (outer parentheses removed),
    or an empty list when the line is not an INSERT into "cur" or holds
    no row at all.
    """
    prefix = "INSERT INTO cur VALUES "
    if not line.startswith(prefix):
        return []

    # Peel off the statement terminator and the outermost parentheses
    # one by one rather than by fixed offsets, so that CRLF line endings
    # or a missing final newline do not shift the slice.
    body = line[len(prefix):].rstrip()
    if body.endswith(';'):
        body = body[:-1]
    if body.startswith('('):
        body = body[1:]
    if body.endswith(')'):
        body = body[:-1]
    if not body:
        return []

    # Turn every row boundary into a newline, then split on it.
    return p1.sub(r"\1'\n", body).split("\n")

# Separator between two adjacent quoted columns of a row.
p2 = re.compile("','")

def extract2(line):
    """Pick the namespace, title, text and timestamp out of one raw row.

    line is one element of the list returned by extract1(): the
    comma-separated columns of a row, without the outer parentheses.

    Returns the tuple (namespace, title, text, timestamp).  The title is
    unescaped; the text is returned exactly as it appears in the row.
    """
    # Leading numeric columns: head[0] = id, head[1] = namespace,
    # head[2] = everything else.
    head = line.split(",", 2)

    # Drop the opening quote of the title, then cut at the first five
    # "','" separators:
    #   cols[0] = title
    #   cols[1] = article text
    #   cols[2] = summary + user_id + user_name
    #   cols[3] = timestamp
    #   cols[4] = ...
    quoted = head[2][1:]
    cols = p2.split(quoted, 5)

    # NOTE(review): eval() is used to undo the backslash escapes of the
    # title, which means text coming from the dump is evaluated as a
    # Python expression.  Only run this on dumps from a trusted source.
    title = eval("'" + cols[0] + "'")

    return (head[1], title, cols[1], cols[3])


# Output directory of each namespace, keyed by the namespace number as it
# appears (as a string) in the dump.  The position of a directory in the
# list below is its namespace number: '0' -> 'article', ... '15' ->
# 'category_talk'.
ns_name = dict(zip(map(str, range(16)), [
    'article',
    'talk',
    'user',
    'user_talk',
    'wikipedia',
    'wikipedia_talk',
    'image',
    'image_talk',
    'msg',
    'msg_talk',
    'template',
    'template_talk',
    'help',
    'help_talk',
    'category',
    'category_talk',
]))

def set_file_time(pathname, ts):
    """Set the access and modification times of *pathname* from *ts*.

    pathname -- file whose times are updated.
    ts       -- string whose first 14 characters are the digits
                YYYYMMDDhhmmss.

    The fields are converted with time.mktime(), i.e. read as local time
    with the DST flag forced to 0.  Nothing is raised for a malformed
    timestamp or a failing os.utime(): the timestamp and the parsed
    fields (None when parsing failed) are reported on stderr instead.
    """
    import os
    import time

    fields = None
    try:
        # Parse the timestamp string.
        year = int(ts[0:4])
        month = int(ts[4:6])
        day = int(ts[6:8])
        hour = int(ts[8:10])
        minute = int(ts[10:12])
        second = int(ts[12:14])

        # Last three entries: weekday, day of year, DST flag.
        fields = (year, month, day, hour, minute, second, 0, 0, 0)
        stamp = time.mktime(fields)
        os.utime(pathname, (stamp, stamp))
    except (ValueError, TypeError, OverflowError, OSError):
        sys.stderr.write("%s -> %s\n" % (ts, fields))

def mkfilename(ns, name):
    """Return the file name of page *name*, relative to its namespace dir.

    Every '/' of the name is replaced by '%25' so the result never
    contains a path separator of its own.  Outside namespace '0', or
    when the "-big" layout is off (big == 0), that is the whole result.

    With the "-big" layout, namespace-0 names are prefixed with a bucket
    directory: the upper-cased first character when it is one of A-Z or
    0-9, and '-' for anything else, including an empty name.
    """
    name = name.replace('/', '%25')
    if ns != '0' or big == 0:
        return name

    # Accept only the buckets that the "-big" set-up creates (A-Z, 0-9
    # and '-'); name[:1] keeps an empty name from raising IndexError.
    initial = name[:1].upper()
    if initial and initial in "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789":
        return initial + '/' + name
    return '-/' + name

def mklnkname(ns, name):
    """Return the symlink target to use for a redirect to page *name*.

    This is mkfilename(ns, name); for namespace '0' under the "-big"
    layout it is prefixed with '../', because the link itself then lives
    inside a bucket sub-directory.
    """
    target = mkfilename(ns, name)
    if ns == '0' and big != 0:
        return '../' + target
    return target
        
def out(ns, name, data, ts):
    """Write one page to disk, or record it as a redirect.

    ns   -- namespace number as a string (a key of ns_name).
    name -- page title.
    data -- page text, already unescaped.
    ts   -- timestamp string handed to set_file_time().

    A page whose text starts with "#REDIRECT [[target]]" is not written:
    the pair (link path, link target) is stored in symlink instead.  Any
    other page is written as is under the directory of its namespace,
    and its file times are set from ts.

    Failures are reported on stderr and the page is skipped; nothing is
    raised.
    """
    # A namespace without an output directory must not abort the run.
    if ns not in ns_name:
        sys.stderr.write("unknown namespace '%s' for '%s'\n" % (ns, name))
        return
    pathname = ns_name[ns] + '/' + mkfilename(ns, name)

    # Anchored at the start of the text (re.match): a page that merely
    # mentions "#REDIRECT [[...]]" further down is a regular page.
    match = re.match(r'\s*#\s*REDIRECT\s*\[\[([^\]]*)\]\]', data, re.I)
    if match:     ## found a redirect
        dest = match.group(1).replace(' ', '_')
        if len(dest) >= 1:
            dest = dest[0].upper() + dest[1:]
            symlink[pathname] = mklnkname(ns, dest)
        return

    try:
        f = open(pathname, "w")
        try:
            f.write(data)
        finally:
            # Close the handle even when the write fails.
            f.close()
        set_file_time(pathname, ts)
    except Exception:
        sys.stderr.write("can't write '%s'\n" % (pathname))


import os
import shutil

# Start from an empty output directory for every namespace.  Whatever
# already exists under that name in the current directory is deleted.
for directory in ns_name.values():
    sys.stdout.write("create dir %s\n" % (directory))
    if os.path.isdir(directory) and not os.path.islink(directory):
        shutil.rmtree(directory)
    elif os.path.islink(directory) or os.path.exists(directory):
        os.remove(directory)
    os.mkdir(directory)

# "-big": spread namespace-0 pages over one sub-directory per initial.
if '-big' in sys.argv[1:]:
    big = 1
    for bucket in "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-":
        os.mkdir(ns_name['0'] + '/' + bucket)

# One backslash escape of the page text: either the pair "\r\n" written
# as escapes, or a backslash followed by any single character.
_sql_escape = re.compile(r'\\(r\\n|.)', re.S)

def _unescape_sql(match):
    """Return the replacement text for one escape found by _sql_escape."""
    seq = match.group(1)
    if seq == 'n' or seq == 'r\\n':
        return '\n'
    if seq in ("'", '"', '\\'):
        return seq
    # Any other escape is left exactly as it was found.
    return match.group(0)

# Read the dump from stdin and write out every row of every INSERT.
for line in sys.stdin:
    for row in extract1(line):
        ns, title, txt, ts = extract2(row)

        # To restrict the extraction to some namespaces, test ns here
        # and "continue" for the unwanted ones.

        # All escapes are resolved in a single left-to-right pass, so an
        # escaped backslash followed by "n" (e.g. TeX "\nabla") yields a
        # backslash and an "n", not a line break.
        out(ns, title, _sql_escape.sub(_unescape_sql, txt), ts)


import os

# Create the links for the redirects recorded by out(), after the
# extraction loop has finished.  A link that cannot be created is
# reported on stderr and skipped; "except Exception" keeps that
# best-effort behaviour without also swallowing an interrupt.
for pathname, dest in symlink.items():
    try:
        os.symlink(dest, pathname)
    except Exception:
        sys.stderr.write('Erreur: REDIRECT "%s" to "%s"\n' % (pathname, dest))

THIS WEB:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia 2006:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu