Web Analytics
Privacy Policy Cookie Policy Terms and Conditions User:DhanakBot/szulhal.py - Wikipédia

User:DhanakBot/szulhal.py

A Wikipédiából, a szabad lexikonból.

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
Ez a bot a magyar wikipédia életrajzi lapjainak születési és halálozási
dátumait jegyzi be a megfelelő év- és nap-lapokra.

A következő argumentumokat ismeri:

    -cat:kategória     az adott kategória szócikkeit járja be

    -file:textfájl     a textfájlban hivatkozott lapokat járja be
                       (csak a [[lapnév]] formájú hivatkozásokat tekinti)

    -ro                csak olvasás mód teszteléshez, nem módosít a lapokon

Minden más paraméter egy lap címeként értelmeződik; ilyenkor csak ezt az
egy életrajzi lapot dolgozza fel a bot.
"""

__version__='0.3'

import sys, re, time, codecs
import wikipedia, date, pagegenerators, catlib
import szulhalGUI

# Entry of date.formats['MonthName'] for the site's language; dayCnvt calls it
# with a month name.
# NOTE(review): presumably it maps the name to the month's number - confirm
# against the date module.
monthIdx = date.formats['MonthName'][wikipedia.getSite().language()]

# A year: a run of three or four digits, captured as one group.
yearre  = re.compile("([0-9]{3,4})")
# Non-capturing alternation of the names returned by date.makeMonthNamedList
# for the site's language.
monthre = re.compile("(?:" + '|'.join(date.makeMonthNamedList(wikipedia.getSite().language(), '%s')) + ")")
# A day of the year: month name (group 1), one or more spaces, then a one- or
# two-digit day number (group 2).
dayre   = re.compile("(" + monthre.pattern + ") +([0-9]{1,2})")

# A full date: year (group 1), optionally followed by "]]" and/or a dot, then
# spaces, an optional "[[", and "month day" separated by exactly one space
# (group 2).
datere  = re.compile(yearre.pattern + "(?:\]\])?\.? +(?:\[\[)?(" + monthre.pattern + " [0-9]{1,2})")
# Lead sentence of a biography: group 1 is the text between the first "(" and
# the last usable ")" on the line (the dates are searched for in it); group 2
# is the text that follows, up to the first full stop not followed by
# "század" (Hungarian for "century"), the first semicolon, or the end of line.
headre  = re.compile(u"^.*?\((.*)\) *[.:;,-]? *(.*?)(?:\.(?! *század)|;|$)")

# Log file, truncated and opened for both writing and reading at import time;
# SzulHalBot.run reads it back in order to publish it on the wiki.
log = codecs.open("szulhal.log", "w+", "utf-8")

class SzulHalBot:
    """
    Bot that records the birth and death dates found on biography pages
    on the corresponding year and day pages.
    """

    def __init__(self, generator, headerLines = 10, readOnly = False):
        """
        generator   -- iterable yielding the biography pages to process
        headerLines -- number of leading lines of each page that are searched
                       for the lead sentence holding the dates
        readOnly    -- if true, no date page is saved
        """
        self.generator = generator
        self.headerLines = headerLines
        self.readOnly = readOnly
        # Becomes True once a date page has been saved; run() uploads the
        # log only in that case.
        self.modifications = False


    def run(self):
        """
        Process every page of the generator.

        The first headerLines lines of each page are matched against headre;
        the first line that yields at least one date becomes a candidate.
        The candidates are handed to szulhalGUI.SzulHalGUI for review, the
        first element of its result is passed on to checkPerson, and the
        titles in the second element are logged as unrecognised.  If any
        date page was saved, the log file is prepended to the
        User:DhanakBot/Napló page.
        """
        data = []

        log.write("= %s =\n" % time.asctime(time.gmtime(time.time())))
        log.write(u"== Nem felismert életrajzi lapok ==\n")

        for page in self.generator:
            try:
                text = page.get()
            except wikipedia.NoPage:
                wikipedia.output(u'HIBA: Nem találom a %s lapot' % page.title())
                continue
            except wikipedia.IsRedirectPage:
                continue

            for line in text.splitlines()[0:self.headerLines]:
                hdmatch = headre.match(line)
                if not hdmatch:
                    continue
                # The (None, None) sentinel stands in for a missing death
                # date, so a single date found still gives two entries.
                dates = datere.findall(hdmatch.group(1)) + [(None, None)]
                if len(dates) >= 2:
                    person = (page.title(),) + dates[0] + dates[1] + (hdmatch.group(2),)
                    data.append((line, person))
                    break
            else:
                # No line of the header produced a date.
                log.write("* [[%s]]\n" % page.title())
                wikipedia.output(u"%s első %d sorában nem találtam a mintára illeszkedő fejet!" %
                                 (page.title(), self.headerLines))

        data = szulhalGUI.SzulHalGUI(data).display()
        if not data:
            return

        # data[1] is still written under the "unrecognised" heading.
        for person in data[1]:
            log.write("* [[%s]]\n" % person[0])
        data = data[0]

        log.write(u"== Sikeresen felismert életrajzi lapok ==\n")

        for person in data:
            self.checkPerson(*person)

        if self.modifications:
            # Rewind the log file and put its contents in front of the
            # existing on-wiki log.
            log.seek(0)
            page = wikipedia.Page(wikipedia.getSite(), u"User:DhanakBot/Napló")
            try:
                naplo = page.get()
                page.put("".join(log.readlines()) + naplo, u"Robot: Elvégzett munka naplózása")
            except (wikipedia.NoPage, wikipedia.IsRedirectPage):
                # Unicode literal: the message contains non-ASCII characters,
                # like every other message passed to wikipedia.output here.
                wikipedia.output(u"A naplót nem tudtam frissíteni")


    def checkPerson(self, person, birthyear, birthday, deathyear, deathday, desc):
        """
        Update all data of one person on the (at most) four date pages.

        The birth year and birth day pages are always checked; the death
        year and death day pages only when both death values are present.
        Pages whose text changed are saved unless the bot is read-only.
        """
        log.write("* [[%s]]\n" % person)
        pages = [
            self.checkDatePage(birthyear, u"Robot: %s születése" % person, u"Születések",
                               person, dayCmp, birthday, u'†', deathyear, desc),
            self.checkDatePage(birthday, u"Robot: %s születése" % person, u"Születések",
                               person, yearCmp, birthyear, u'†', deathyear, desc)
            ]

        if deathyear and deathday:
            pages += [
                self.checkDatePage(deathyear, u"Robot: %s halála" % person, u"Halálozások",
                                   person, dayCmp, deathday, u'*', birthyear, desc),
                self.checkDatePage(deathday, u"Robot: %s halála" % person, u"Halálozások",
                                   person, yearCmp, deathyear, u'*', birthyear, desc)
                ]

        if not self.readOnly:
            # checkDatePage returns None for pages that need no edit or
            # could not be processed; filter(None, ...) drops those.
            for (page, newText, comment) in filter(None, pages):
                try:
                    page.put(newText, comment)
                    self.modifications = True
                except wikipedia.EditConflict as arg:
                    log.write(u"** HIBA: %s\n" % arg.args[0])
                    wikipedia.output("HIBA: %s" % arg.args[0])


    def checkDatePage(self, pageTitle, modCmt, *changeArgs):
        """
        Work out the update of one person's data on the pageTitle date page.

        changeArgs is forwarded to changeSection.  Returns the tuple
        (page, newText, modCmt) when the page text changes, otherwise None
        (no change needed, section not found, page missing or a redirect).
        Nothing is saved here.
        """
        log.write("** [[%s]] " % pageTitle)
        try:
            page = wikipedia.Page(wikipedia.getSite(), pageTitle)
            text = page.get()
            newText = self.changeSection(text, *changeArgs)
            wikipedia.showDiff(text, newText)
            if text != newText:
                log.write(u"módosítva\n")
                return (page, newText, modCmt)
            log.write(u"módosítása nem szükséges\n")
        except SzulHalExn as arg:
            wikipedia.output(arg.args[0])
            log.write(arg.args[0] + "\n")
        except wikipedia.NoPage:
            log.write(u"nem létezik\n")
            wikipedia.output(u'Nem találom a %s lapot' % page.title())
        except wikipedia.IsRedirectPage:
            log.write(u"redirekt\n")
        return None

    def changeSection(self, text, section, *changeArgs):
        """
        Return text with the first section titled `section` rewritten by
        InsertBirthDeath(*changeArgs).

        The section body ends before the next "==", "{{" or "[[en:".
        Raises SzulHalExn when no such section is found.
        """
        # The raw-string part keeps the backslash escapes of the pattern
        # unchanged while the "\n" in the first part stays a real newline.
        sectionre = re.compile("(== *" + section + " *==\n)" + r"(.*?)(?===|{{|\[\[en:)",
                               re.DOTALL)
        (newText, subCnt) = sectionre.subn(InsertBirthDeath(*changeArgs), text, 1)
        if subCnt == 0:
            raise SzulHalExn(u"HIBA: nem találom a %s szakaszt" % section)
        return newText


class SzulHalExn(Exception):
    """Raised when a date page cannot be updated: the wanted section is
    missing or one of its lines cannot be ordered."""

class InvalidLine(Exception):
    """Raised by DateCmp for a line in which its date pattern is not found;
    the offending line is the exception argument."""

class InsertBirthDeath:
    """
    Replacement callable for a regex substitution over one births/deaths
    section of a date page: it tidies the section's lines, adds or
    completes the entry of one person and re-sorts the lines.
    """

    def __init__(self, personPage, whenCmp, when, otherChar, otherYear, description):
        """
        personPage  -- title of the biography page
        whenCmp     -- cmp-style comparator used to order the section lines
        when        -- the date (year or day) that starts the new entry
        otherChar   -- marker put before otherYear in the trailing parentheses
        otherYear   -- year of the person's other event, or a false value
        description -- text appended after the name, may be empty
        """
        self.personPage = personPage
        # Name shown in the entry: the title up to the first " (".
        self.person = personPage.split(" (")[0]
        self.whenCmp = whenCmp
        self.when = when
        self.otherChar = otherChar
        self.otherYear = otherYear
        self.description = description


    def __call__(self, sectionMatch):
        """
        Return the replacement text for sectionMatch (group 1: the section
        heading, group 2: the section body).

        Raises SzulHalExn when the comparator rejects a line.
        """
        sectionLines = [self.tidy(old)
                        for old in sectionMatch.group(2).splitlines()]
        if self.otherYear:
            otherText = " (" + self.otherChar + " [[" + self.otherYear + "]])"
        else:
            otherText = ""

        # Plain substring tests: the name and the year are literal text, not
        # regular expressions, so titles containing characters such as
        # "(", "+" or "[" must not be interpreted as patterns.
        for i, existing in enumerate(sectionLines):
            if self.person in existing:
                if otherText and self.otherYear not in existing:
                    sectionLines[i] = existing + otherText
                break
        else:
            # The person is not listed yet: build a new entry.
            entry = "* [[" + self.when + "]]"
            if not yearre.match(self.when):
                entry += "."
            entry += u" – [[" + self.personPage
            if self.person != self.personPage:
                entry += "|" + self.person
            entry += "]]"
            if self.description:
                entry += ", " + self.description
            entry += otherText
            sectionLines.append(entry)

        try:
            sectionLines.sort(self.whenCmp)
        except InvalidLine as arg:
            raise SzulHalExn(u"HIBA: ismeretlen szerkezetű sor: '%s'" % arg.args[0])

        return sectionMatch.group(1) + '\n'.join(sectionLines) + "\n"


    def tidy(self, line):
        """
        Return line rewritten in the form
        "* [[date]] – text (marker [[year]])".

        Blank lines are returned stripped; lines not matching tidyre are
        reported with wikipedia.output and returned stripped but otherwise
        unchanged.
        """
        line = line.strip()
        if line == "":
            return line

        match = tidyre.match(line)
        if not match:
            wikipedia.output(u"FIGYELEM: Ennek a sornak nem ismertem fel a szerkezetét: '%s'" % line)
            return line

        line = "* [[" + match.group(1) + "]]"
        # A dot follows the date unless it contains a year.
        if not yearre.search(match.group(1)):
            line += "."
        line += u" – " + match.group(5) # groups 2, 3 and 4 are in yearre and dayre

        if match.group(6) and match.group(7): # otherchar and otheryear
            line += " ("
            # "*" or a word starting with "sz" becomes "*"; anything else "†".
            if match.group(6) == '*' or match.group(6).startswith("sz"):
                line += "*"
            else:
                line += u"†"
            line += " [[" + match.group(7) + "]])"

        return line
        

# Shape of one list entry of a births/deaths section, used by
# InsertBirthDeath.tidy:
#   group 1     the date: a year or a "month day", optionally inside [[ ]]
#   groups 2-4  inner groups contributed by yearre (2) and dayre (3 and 4)
#   group 5     the entry text after the hyphen or en dash
#   group 6     optional marker inside trailing parentheses: "*", "†", or
#               text starting with "sz" or "m"
#   group 7     the year that follows that marker
tidyre = re.compile("^\* *(?:\[\[)?" +
                    "(" + yearre.pattern + "|" + dayre.pattern + ")" +
                    u"(?:\]\])?(?: *\.)? *[-–] *(.*?) *[.;]? *" +
                    u"(?: *\( *([*†]|sz.*?|m.*?|†) *(?:\[\[)?" +
                    yearre.pattern + "(?:\\]\])? *\) *[.;]? *)?$")

class DateCmp:
    """
    cmp-style comparator that orders text lines by a date found in them.

    pattern -- regular expression (string or compiled) locating the date
    convert -- called with the groups of the match; its results are compared
    """

    def __init__(self, pattern, convert):
        self.regex = re.compile(pattern)
        self.convert = convert

    def __call__(self, str1, str2):
        """
        Compare two lines; empty lines sort after all others.

        Raises InvalidLine for a non-empty line without a match.
        """
        firstEmpty = (str1 == "")
        secondEmpty = (str2 == "")
        if firstEmpty or secondEmpty:
            # True compares greater than False, which puts empty lines last.
            return cmp(firstEmpty, secondEmpty)

        found1 = self.regex.search(str1)
        found2 = self.regex.search(str2)
        if not found1:
            raise InvalidLine(str1)
        if not found2:
            raise InvalidLine(str2)

        key1 = self.convert(*found1.groups())
        key2 = self.convert(*found2.groups())
        return cmp(key1, key2)

def dayCnvt(monthstr, daystr):
    """Return the pair (monthIdx(monthstr), int(daystr)) used to order days."""
    month = monthIdx(monthstr)
    day = int(daystr)
    return (month, day)

# Orders lines by the first year (yearre match) found in each of them.
yearCmp = DateCmp(yearre, int)
# Orders lines by the first "month day" (dayre match) found in each of them,
# compared as the tuple built by dayCnvt.
dayCmp  = DateCmp(dayre, dayCnvt)


def main():
    """
    Parse the command line and run the bot.

    -file: and -cat: select a page generator, -ro switches to read-only
    mode, and all remaining arguments are joined with spaces into the title
    of a single page, which then replaces any generator chosen before.
    Without any page source the help text is shown instead.
    """
    source = None
    dryRun = False
    titleWords = []

    for rawArg in sys.argv[1:]:
        arg = wikipedia.argHandler(rawArg, 'szulhal')
        if not arg:
            # Argument was consumed by the framework.
            continue
        if arg.startswith('-file:'):
            source = pagegenerators.TextfilePageGenerator(arg[len('-file:'):])
        elif arg.startswith('-cat:'):
            category = catlib.Category(wikipedia.getSite(), arg[len('-cat:'):])
            source = pagegenerators.CategorizedPageGenerator(category)
        elif arg == '-ro':
            dryRun = True
        else:
            titleWords.append(arg)

    if titleWords:
        single = wikipedia.Page(wikipedia.getSite(), ' '.join(titleWords))
        source = iter([single])

    if not source:
        wikipedia.showHelp('szulhal')
        return

    preloaded = pagegenerators.PreloadingGenerator(source, pageNumber = 20)
    SzulHalBot(preloaded, readOnly = dryRun).run()


# Script entry point: run main() only when executed directly.
if __name__ == "__main__":
    try:
        main()
        
    finally:
        # Executed whether main() returned normally or raised.
        # NOTE(review): presumably releases the framework's run-time
        # resources - confirm in wikipedia.stopme.
        wikipedia.stopme()
THIS WEB:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia 2006:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu