Un article de Wikipédia, l'encyclopédie libre.
#!/usr/bin/python
# name: utf2lat1.py
import codecs
import sys
# Default codec pair: the input is decoded as UTF-8 and the output is encoded
# as ISO-8859-1.  initialise_codecs() rebinds both names for a given language.
utf82uni = codecs.getdecoder('utf-8')
uni2lat1 = codecs.getencoder('iso-8859-1')
# Set to 1 by initialise_codecs() when the input and output charsets are the
# same; encode() then hands its argument back unchanged.
nil_encode_decode = 0
# Charset names indexed by language code: encoder_names is the output side,
# decoder_names the input side.
encoder_names = {'fr': 'iso-8859-1', 'en': 'iso-8859-1'}
decoder_names = {'fr': 'utf-8', 'en': 'iso-8859-1'}
def initialise_codecs(lang):
    """Select the decoder/encoder pair used by encode() for a language.

    lang is looked up in decoder_names and encoder_names; a language that
    is not listed falls back to decoding UTF-8 and encoding ISO-8859-1.

    When both charsets are identical, encode() is switched to pass-through
    mode (nil_encode_decode = 1) and the codec functions are left alone.
    Otherwise the chosen pair is announced on stdout and installed in the
    module globals uni2lat1 / utf82uni.
    """
    global nil_encode_decode, uni2lat1, utf82uni
    encoder_name = encoder_names.get(lang, 'iso-8859-1')
    decoder_name = decoder_names.get(lang, 'utf-8')
    if decoder_name == encoder_name:
        nil_encode_decode = 1
        return
    # Leave pass-through mode again: without this reset, a call for an
    # identical-charset language made the flag stick for every later call.
    nil_encode_decode = 0
    sys.stdout.write('Input lang: "%s", encoder: "%s", decoder "%s"\n'
                     % (lang, encoder_name, decoder_name))
    uni2lat1 = codecs.getencoder(encoder_name)
    utf82uni = codecs.getdecoder(decoder_name)
def encode(str, file):
    """Convert text from the input charset to the output charset.

    str  -- encoded text in the charset handled by utf82uni.  (The name
            shadows the builtin; it is kept because it is part of the
            signature.)
    file -- label naming where the text comes from; it is only used in the
            error message.

    Returns str unchanged when pass-through mode is on
    (nil_encode_decode == 1).  Otherwise the text is decoded with utf82uni
    and re-encoded with uni2lat1; characters the output charset cannot
    represent are written as numeric character references ('&#NNN;').

    A decoding failure is reported on stderr and re-raised.
    """
    if nil_encode_decode == 1:
        return str
    try:
        text, _ = utf82uni(str)
    except Exception:
        sys.stderr.write('Invalid %s: "%s"\n' % (utf82uni.__name__, file))
        raise
    # The codec's own 'xmlcharrefreplace' handler produces the same
    # '&#NNN;' output as the former hand-written fallback loop, but it is
    # correct for any target charset: the loop copied every code point
    # below 256 verbatim, which is only valid for ISO-8859-1.
    converted, _ = uni2lat1(text, 'xmlcharrefreplace')
    return converted
if __name__ == '__main__':
    # Self-test: convert a short accented sample and show it on stderr.
    # A longer sample for manual checks: 'éèçàùÉÈÇÀÙ'
    tst = 'é è'
    # encode() requires a label as second argument; the original call
    # passed only the text and failed with a TypeError.
    res = encode(tst, 'self-test')
    sys.stderr.write('"%s" -> "%s"\n' % (tst, res))
#!/usr/bin/python
# name: extract SQL.py
import re
import sys
import utf2lat1
# Switched to 1 by the '-big' command-line option; mkfilename() then spreads
# namespace '0' entries over one-character sub-directories.
big = 0
# Becomes 1 once the codecs have been chosen from the dump header, or when
# '-utf8' is given; extract1() stops looking for the header afterwards.
init_codec = 0
# Redirects collected by out(): link pathname -> link target.
symlink = dict()
# Boundary between two rows of a multi-row INSERT: a quote that is not
# preceded by a backslash, followed by "),(".
p1 = re.compile(r"([^\\])'\),\(")
# Header comment of the dump; the group captures the part of the database
# name that precedes "wiki", which is used as the language code.
re_match_lang = re.compile("-- Host:.*Database: (.*)wiki")
def extract1(line):
    """Split one 'INSERT INTO cur VALUES' line of the dump into rows.

    Until the codecs are initialised, every line is also searched for the
    dump header so that utf2lat1.initialise_codecs() can be called with the
    language found there.

    Returns a list with one string per row, without the enclosing
    parentheses, or an empty list when line is not such an INSERT.
    """
    global init_codec
    # Article text may contain something that looks like the header, so the
    # search is performed only until the first hit.
    if init_codec == 0:
        match = re_match_lang.search(line)
        if match is not None:
            utf2lat1.initialise_codecs(match.group(1))
            init_codec = 1
    prefix = "INSERT INTO cur VALUES "
    if not line.startswith(prefix):
        return []
    # Remove the statement wrapper explicitly.  The former fixed slice
    # [24:-3] assumed a single trailing "\n" and cut into the last row when
    # the line ended with "\r\n" or with no line terminator at all.
    body = line[len(prefix):].rstrip('\r\n')
    if body.startswith('('):
        body = body[1:]
    if body.endswith(');'):
        body = body[:-2]
    # Turn every row boundary into a newline, then split on it.
    return p1.sub(r"\1'\n", body).split("\n")
# Separator between two quoted fields of a row.  Not referenced by the code
# visible in this file; kept in case other code relies on the name.
p2 = re.compile("','")
def extract2(line):
    """Pick namespace, title, text and timestamp out of one row.

    line is one element of the list returned by extract1().

    Returns the tuple (namespace, title, text, timestamp), all strings.
    Only the title has its backslash escapes resolved; the text is returned
    as found in the dump.

    Raises ValueError or SyntaxError when the title is not a plain quoted
    string, and IndexError when the row has too few fields.
    """
    import ast
    # head[0] = id, head[1] = namespace, head[2] = everything else
    head = line.split(",", 2)
    rest = head[2][1:]  # drop the quote that opens the title
    # fields[0] = title, fields[1] = article text,
    # fields[2] = summary + user_id + user_name, fields[3] = timestamp
    fields = rest.split("','", 5)
    # SECURITY: the title comes straight from the dump.  It used to be fed
    # to eval(), which executes any expression smuggled into the data;
    # ast.literal_eval() resolves the same escapes but accepts literals only.
    title = ast.literal_eval("'" + fields[0] + "'")
    return (head[1], title, fields[1], fields[3])
# Directory name for each namespace number.  The keys are the numbers as
# strings ('0' .. '15'), which is the form extract2() returns them in.
ns_name = dict(
    (str(number), directory)
    for number, directory in enumerate((
        'article', 'talk',
        'user', 'user_talk',
        'wikipedia', 'wikipedia_talk',
        'image', 'image_talk',
        'msg', 'msg_talk',
        'template', 'template_talk',
        'help', 'help_talk',
        'category', 'category_talk',
    ))
)
def set_file_time(pathname, ts):
    """Set the access and modification times of pathname from a timestamp.

    ts is a string whose first 14 characters are digits laid out as
    YYYYMMDDhhmmss.  The value is converted with time.mktime(), i.e. it is
    interpreted as local time.

    A ts that cannot be parsed raises ValueError.  A timestamp that cannot
    be converted, or a file whose times cannot be changed, is reported on
    stderr and otherwise ignored.
    """
    import os
    import time
    # Parse the timestamp string.  The names avoid shadowing the builtins
    # min and tuple, as the original locals did.
    year = int(ts[0:4])
    month = int(ts[4:6])
    day = int(ts[6:8])
    hour = int(ts[8:10])
    minute = int(ts[10:12])
    second = int(ts[12:14])
    # mktime() wants nine fields; weekday, day of year and DST flag are 0.
    stamp = (year, month, day, hour, minute, second, 0, 0, 0)
    try:
        seconds = time.mktime(stamp)
        os.utime(pathname, (seconds, seconds))
    except (ValueError, OverflowError, OSError):
        # Best effort only, but no longer swallowing KeyboardInterrupt.
        sys.stderr.write("%s -> %s\n" % (ts, stamp))
def mkfilename(ns, name):
    """Return the path of an entry relative to its namespace directory.

    Every '/' in name is replaced so that the result is a single path
    component.  When the module flag big is set and ns is '0', the entry
    is placed in a one-character sub-directory: the upper-cased first
    character when that is an ASCII letter or digit, '-' otherwise
    (including for an empty name).
    """
    # NOTE(review): '%25' is the percent-encoding of '%', not of '/'
    # ('%2F').  Left unchanged because existing trees use these names.
    name = name.replace('/', '%25')
    if ns != '0' or big == 0:
        return name
    # Only these sub-directories (plus '-') are created by the '-big' setup
    # of the script, so nothing else may be chosen here.  The former
    # isalnum() test could select a non-ASCII letter, and name[0] raised
    # IndexError for an empty name such as the target of '#REDIRECT [[]]'.
    buckets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    first = name[:1].upper()
    if first and first in buckets:
        return first + '/' + name
    return '-/' + name
def mklnkname(ns, name):
    """Return the symlink target to use for a redirect to name.

    The target is what mkfilename() returns for the same arguments.  When
    the module flag big is set and ns is '0', it is prefixed with '../'.
    """
    target = mkfilename(ns, name)
    if ns == '0' and big != 0:
        # In this mode mkfilename() puts entries one directory level down,
        # so the relative target has to climb back up first.
        target = '../' + target
    return target
def out(ns, name, data, ts):
    """Store one entry on disk, or remember it as a redirect.

    ns   -- namespace key into ns_name; selects the directory.
    name -- entry title in the input charset; it is converted with
            utf2lat1.encode() before being used as a file name.
    data -- entry text; converted with utf2lat1.encode() when written.
    ts   -- timestamp string passed on to set_file_time().

    Text containing '#REDIRECT [[target]]' is not written; the pair
    (pathname, link target) is stored in symlink instead.

    Conversion and write failures are reported and the entry is skipped.
    An ns that is missing from ns_name still raises KeyError.
    """
    try:
        name = utf2lat1.encode(name, 'filename: ' + name)
    except Exception:
        sys.stdout.write("illégal namefile: '%s'\n" % (name))
        return
    pathname = ns_name[ns] + '/' + mkfilename(ns, name)
    ## search for redirect
    match = re.search(r'\s*#\s*REDIRECT\s*\[\[([^\]]*)\]\]', data, re.I)
    if match:  ## found a redirect
        dest = match.group(1).replace(' ', '_')
        if dest:
            dest = dest[0].upper() + dest[1:]
        # The target must go through the same charset conversion as the
        # file names above; otherwise a redirect to a title with non-ASCII
        # characters points at a path that is never created.
        try:
            dest = utf2lat1.encode(dest, 'filename: ' + dest)
        except Exception:
            sys.stdout.write("illégal namefile: '%s'\n" % (dest))
            return
        symlink[pathname] = mklnkname(ns, dest)
        return
    try:
        f = open(pathname, "w")
        try:
            f.write(utf2lat1.encode(data, pathname))
        finally:
            # Close the handle even when the conversion or the write fails.
            f.close()
        set_file_time(pathname, ts)
    except Exception:
        # Deliberately best effort: report and go on with the next entry.
        sys.stderr.write("can't write '%s'\n" % (pathname))
def unescape_sql_text(text):
    """Resolve the backslash escapes of a quoted string from the dump.

    Handles \\' and \\" (quotes), \\n and the pair \\r\\n (both become one
    newline) and \\\\ (one backslash).  Any other escape is left as it is.

    The text is scanned once, escape by escape.  The former chain of
    independent substitutions turned an escaped backslash followed by 'n'
    into a backslash plus newline and never collapsed doubled backslashes
    (that step was commented out).
    """
    replacements = {"'": "'", '"': '"', 'n': '\n', 'r\\n': '\n', '\\': '\\'}

    def expand(match):
        return replacements.get(match.group(1), match.group(0))

    return re.sub(r'(?s)\\(r\\n|.)', expand, text)


# Run only as a script, so that importing the module no longer wipes
# directories and consumes stdin.
if __name__ == '__main__':
    import os
    # Start from empty output directories, one per namespace.
    for dirname in ns_name.values():
        sys.stdout.write("create dir %s\n" % (dirname))
        os.system('rm -rf %s' % (dirname))
        os.mkdir(dirname)
    if '-big' in sys.argv[1:]:
        big = 1
        for bucket in "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-":
            os.mkdir(ns_name['0'] + '/' + bucket)
    if '-utf8' in sys.argv[1:]:
        # Keeps extract1() from choosing codecs from the dump header.
        init_codec = 1
    # Iterate over the file object directly; xreadlines() is deprecated.
    for line in sys.stdin:
        for row in extract1(line):
            ns, title, txt, stamp = extract2(row)
            out(ns, title, unescape_sql_text(txt), stamp)
    # Create the redirects last, once the regular files have been written.
    for pathname, dest in symlink.items():
        try:
            os.symlink(dest, pathname)
        except Exception:
            # Best effort: report the failed link and go on.
            sys.stderr.write('Erreur: REDIRECT "%s" to "%s"\n'
                             % (pathname, dest))
#!/usr/bin/python
# name: extract_SQL_UTF8.py
import re
import sys
# Switched to 1 by the '-big' command-line option; mkfilename() then spreads
# namespace '0' entries over one-character sub-directories.
big = 0
# Redirects collected by out(): link pathname -> link target.
symlink = dict()
# Boundary between two rows of a multi-row INSERT: a quote that is not
# preceded by a backslash, followed by "),(".
p1 = re.compile(r"([^\\])'\),\(")
def extract1(line):
    """Split one 'INSERT INTO cur VALUES' line of the dump into rows.

    Returns a list with one string per row, without the enclosing
    parentheses, or an empty list when line is not such an INSERT.
    """
    prefix = "INSERT INTO cur VALUES "
    if not line.startswith(prefix):
        return []
    # Remove the statement wrapper explicitly.  The former fixed slice
    # [24:-3] assumed a single trailing "\n" and cut into the last row when
    # the line ended with "\r\n" or with no line terminator at all.
    body = line[len(prefix):].rstrip('\r\n')
    if body.startswith('('):
        body = body[1:]
    if body.endswith(');'):
        body = body[:-2]
    # Turn every row boundary into a newline, then split on it.
    return p1.sub(r"\1'\n", body).split("\n")
# Separator between two quoted fields of a row.  Not referenced by the code
# visible in this file; kept in case other code relies on the name.
p2 = re.compile("','")
def extract2(line):
    """Pick namespace, title, text and timestamp out of one row.

    line is one element of the list returned by extract1().

    Returns the tuple (namespace, title, text, timestamp), all strings.
    Only the title has its backslash escapes resolved; the text is returned
    as found in the dump.

    Raises ValueError or SyntaxError when the title is not a plain quoted
    string, and IndexError when the row has too few fields.
    """
    import ast
    # head[0] = id, head[1] = namespace, head[2] = everything else
    head = line.split(",", 2)
    rest = head[2][1:]  # drop the quote that opens the title
    # fields[0] = title, fields[1] = article text,
    # fields[2] = summary + user_id + user_name, fields[3] = timestamp
    fields = rest.split("','", 5)
    # SECURITY: the title comes straight from the dump.  It used to be fed
    # to eval(), which executes any expression smuggled into the data;
    # ast.literal_eval() resolves the same escapes but accepts literals only.
    title = ast.literal_eval("'" + fields[0] + "'")
    return (head[1], title, fields[1], fields[3])
# Directory name for each namespace number.  The keys are the numbers as
# strings ('0' .. '15'), which is the form extract2() returns them in.
ns_name = dict(
    (str(number), directory)
    for number, directory in enumerate((
        'article', 'talk',
        'user', 'user_talk',
        'wikipedia', 'wikipedia_talk',
        'image', 'image_talk',
        'msg', 'msg_talk',
        'template', 'template_talk',
        'help', 'help_talk',
        'category', 'category_talk',
    ))
)
def set_file_time(pathname, ts):
    """Set the access and modification times of pathname from a timestamp.

    ts is a string whose first 14 characters are digits laid out as
    YYYYMMDDhhmmss.  The value is converted with time.mktime(), i.e. it is
    interpreted as local time.

    A ts that cannot be parsed raises ValueError.  A timestamp that cannot
    be converted, or a file whose times cannot be changed, is reported on
    stderr and otherwise ignored.
    """
    import os
    import time
    # Parse the timestamp string.  The names avoid shadowing the builtins
    # min and tuple, as the original locals did.
    year = int(ts[0:4])
    month = int(ts[4:6])
    day = int(ts[6:8])
    hour = int(ts[8:10])
    minute = int(ts[10:12])
    second = int(ts[12:14])
    # mktime() wants nine fields; weekday, day of year and DST flag are 0.
    stamp = (year, month, day, hour, minute, second, 0, 0, 0)
    try:
        seconds = time.mktime(stamp)
        os.utime(pathname, (seconds, seconds))
    except (ValueError, OverflowError, OSError):
        # Best effort only, but no longer swallowing KeyboardInterrupt.
        sys.stderr.write("%s -> %s\n" % (ts, stamp))
def mkfilename(ns, name):
    """Return the path of an entry relative to its namespace directory.

    Every '/' in name is replaced so that the result is a single path
    component.  When the module flag big is set and ns is '0', the entry
    is placed in a one-character sub-directory: the upper-cased first
    character when that is an ASCII letter or digit, '-' otherwise
    (including for an empty name).
    """
    # NOTE(review): '%25' is the percent-encoding of '%', not of '/'
    # ('%2F').  Left unchanged because existing trees use these names.
    name = name.replace('/', '%25')
    if ns != '0' or big == 0:
        return name
    # Only these sub-directories (plus '-') are created by the '-big' setup
    # of the script, so nothing else may be chosen here.  The former
    # isalnum() test could select a non-ASCII letter, and name[0] raised
    # IndexError for an empty name such as the target of '#REDIRECT [[]]'.
    buckets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    first = name[:1].upper()
    if first and first in buckets:
        return first + '/' + name
    return '-/' + name
def mklnkname(ns, name):
    """Return the symlink target to use for a redirect to name.

    The target is what mkfilename() returns for the same arguments.  When
    the module flag big is set and ns is '0', it is prefixed with '../'.
    """
    target = mkfilename(ns, name)
    if ns == '0' and big != 0:
        # In this mode mkfilename() puts entries one directory level down,
        # so the relative target has to climb back up first.
        target = '../' + target
    return target
def out(ns, name, data, ts):
    """Store one entry on disk, or remember it as a redirect.

    ns   -- namespace key into ns_name; selects the directory.
    name -- entry title, used as the file name.
    data -- entry text, written unchanged.
    ts   -- timestamp string passed on to set_file_time().

    Text containing '#REDIRECT [[target]]' is not written; the pair
    (pathname, link target) is stored in symlink instead.

    Write failures are reported on stderr and the entry is skipped.  An ns
    that is missing from ns_name still raises KeyError.
    """
    pathname = ns_name[ns] + '/' + mkfilename(ns, name)
    ## search for redirect
    match = re.search(r'\s*#\s*REDIRECT\s*\[\[([^\]]*)\]\]', data, re.I)
    if match:  ## found a redirect
        dest = match.group(1).replace(' ', '_')
        if dest:
            dest = dest[0].upper() + dest[1:]
        symlink[pathname] = mklnkname(ns, dest)
        return
    try:
        f = open(pathname, "w")
        try:
            f.write(data)
        finally:
            # Close the handle even when the write fails.
            f.close()
        set_file_time(pathname, ts)
    except Exception:
        # Deliberately best effort: report and go on with the next entry.
        sys.stderr.write("can't write '%s'\n" % (pathname))
def unescape_sql_text(text):
    """Resolve the backslash escapes of a quoted string from the dump.

    Handles \\' and \\" (quotes), \\n and the pair \\r\\n (both become one
    newline) and \\\\ (one backslash).  Any other escape is left as it is.

    The text is scanned once, escape by escape.  The former chain of
    independent substitutions turned an escaped backslash followed by 'n'
    into a backslash plus newline and never collapsed doubled backslashes
    (that step was commented out).
    """
    replacements = {"'": "'", '"': '"', 'n': '\n', 'r\\n': '\n', '\\': '\\'}

    def expand(match):
        return replacements.get(match.group(1), match.group(0))

    return re.sub(r'(?s)\\(r\\n|.)', expand, text)


# Run only as a script, so that importing the module no longer wipes
# directories and consumes stdin.
if __name__ == '__main__':
    import os
    # Start from empty output directories, one per namespace.
    for dirname in ns_name.values():
        sys.stdout.write("create dir %s\n" % (dirname))
        os.system('rm -rf %s' % (dirname))
        os.mkdir(dirname)
    if '-big' in sys.argv[1:]:
        big = 1
        for bucket in "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-":
            os.mkdir(ns_name['0'] + '/' + bucket)
    # Iterate over the file object directly; xreadlines() is deprecated.
    for line in sys.stdin:
        for row in extract1(line):
            ns, title, txt, stamp = extract2(row)
            out(ns, title, unescape_sql_text(txt), stamp)
    # Create the redirects last, once the regular files have been written.
    for pathname, dest in symlink.items():
        try:
            os.symlink(dest, pathname)
        except Exception:
            # Best effort: report the failed link and go on.
            sys.stderr.write('Erreur: REDIRECT "%s" to "%s"\n'
                             % (pathname, dest))