User:Purodha/translatorlanguages.py

From translatewiki.net
Jump to: navigation, search
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Get translators language portal pages from translatewiki.net
"""

#
# (C) Purodha Blissenbach, 2011.
#
__version__ = "$Id: editarticle.py 8365 2010-07-26 20:52:10Z purodha $"
#
# Distributed under the terms of the MIT license.
#

# Version 1.8.
#
# Note: This program will take several minutes to read the translators
#       pages of all language portals.
# TODO:
#       find the last edit of a message of a language, but ignore
#       FUZZYBOT and nontranslators, or excluded credit takers.
#       Finalize Ethnologue tree

# Python 2: make every class defined in this module a new-style class.
__metaclass__ = type
import string
import re
import wikipedia as pywikibot
import pagegenerators
import config
import urllib2
import time

class TranslatorPortalCollector:
    """
    Collect data about language portals from the wiki and print reports.

    Three sources are read: the portal links on the supported languages
    page, the portal links on the listed languages page, and the
    '/translators' subpages found in the Portal namespace. The show*()
    methods print wikitext headlines and numbered lists built from them.

    All language dictionaries are keyed by the lower-cased language code.
    """

    # Fixed constants
    SupportedLanguagesPageTitle = u'Special:SupportedLanguages'
    ListedLanguagesPageTitle = u'Languages by language family'
    translators = u'/translators'
    nsNamePortal = u'Portal'
    # Computed constants (filled in by __init__)
    pageLinkSupportedLanguages = None
    PageLinkListedLanguages = None
    translalen = None
    nsPortal = None
    site = None
    # Class-level fallback only, kept for compatibility. __init__ gives each
    # instance its own dictionary so that titles are not shared by accident.
    pageTitle = {}

    def __init__(self, *args):
        """
        Select the i18n site and prepare empty result dictionaries.

        Positional arguments are accepted but not used.
        Note that this sets config.family and config.language globally.
        """
        # The former 'wikipedia'/'de' assignments were overwritten at once
        # and had no effect; only the effective values are kept.
        config.family = 'i18n'
        config.language = 'i18n'
        self.site = pywikibot.getSite(config.language, config.family)
        self.nsPortal = self.site.getNamespaceIndex(self.nsNamePortal)
        # Length of the 'Portal:' prefix, including the colon.
        self.portalen = len(self.nsNamePortal) + 1
        self.translalen = len(self.translators)
        # Page titles for headline placeholders: 'S' is set by
        # getSupportedLanguages(), 'L' by getListedLanguages().
        self.pageTitle = {}
        # Never filled by this class, but initialised so that
        # showLanguagesdWithRedirectedPortalPage() does not fail.
        self.portalsWithRedirectedPortalPage = {}
        self.portalsWithRedirectedTranslatorsPage = {}
        self.portalsWithTranslators = {}
        self.portalsWithNonZeroTranslators = {}
        self.pagePortalLinks = {}
        self.SupportedLanguages = {}

    def getSupportedLanguages(self, pageTitle):
        """
        Read the portal links found in the fetched text of pageTitle.

        The page is fetched through site.getUrl() and scanned for href
        attributes pointing to 'Portal:<code>'. Each code is stored
        lower-cased in self.SupportedLanguages. pageTitle is remembered
        under the key 'S' for use in headlines.
        """
        self.pageTitle['S'] = pageTitle
        portalR = re.compile(r'href="[^"]*[\/=][Pp][Oo][Rr][Tt][Aa][Ll]\s?:\s?([a-zA-Z\-]+)"\s?')
        try:
            urlname = pywikibot.Page(self.site, pageTitle).urlname()
            address = '%s?title=%s' % (self.site.path(), urlname)
            text = self.site.getUrl(address)
            text = pywikibot.removeDisabledParts(text, ['comments', 'pre'])
            for lang in portalR.findall(text):
                lang = lang.lower()
                self.SupportedLanguages[lang] = lang
        except pywikibot.exceptions.PageNotFound:
            pywikibot.output(u'Error: page "%s" not found.' % pageTitle)

    def getListedLanguages(self, pageTitle):
        """
        Read the piped portal links from the wikitext of pageTitle.

        Redirects are followed. Every '[[Portal:<code>|<label>]]' link is
        stored in self.pagePortalLinks as lower-cased code -> label.
        pageTitle is remembered under the key 'L' for use in headlines.
        """
        self.pageTitle['L'] = pageTitle
        portalR = re.compile(r'\[\[[Pp][Oo][Rr][Tt][Aa][Ll]\s?:\s?([a-zA-Z\-]+)\s?\|([^\[\]\n]*)\]\]')
        try:
            page = pywikibot.Page(self.site, pageTitle)
            if page.isRedirectPage():
                page = page.getRedirectTarget()
            tags = ['comments', 'nowiki', 'pre', 'source']
            text = pywikibot.removeDisabledParts(page.get(), tags)
            for lang, pagetitle in portalR.findall(text):
                self.pagePortalLinks[lang.lower()] = pagetitle
        except pywikibot.NoPage:
            pywikibot.output(u'Error: page "%s" not found.' % pageTitle)

    def getPortalsWithTranslators(self):
        """
        Find all '/translators' subpages in the Portal namespace.

        Redirect pages go to self.portalsWithRedirectedTranslatorsPage,
        all others to self.portalsWithTranslators. Both map the lower-cased
        language code to the code as it is spelled in the page title.
        """
        for page in self.site.allpages(namespace=self.nsPortal):
            title = page.title()
            if not title.endswith(self.translators):
                continue
            # Strip the 'Portal:' prefix and the '/translators' suffix.
            lang = title[self.portalen:-self.translalen]
            if page.isRedirectPage():
                self.portalsWithRedirectedTranslatorsPage[lang.lower()] = lang
            else:
                self.portalsWithTranslators[lang.lower()] = lang

    def getPortalsWithNonZeroTranslators(self):
        """
        Count the translators mentioned on each '/translators' subpage.

        A translator is any '[[User:...]]' link or '{{user|...}}' template
        in the page text. Languages with at least one of them are added to
        self.portalsWithNonZeroTranslators as code -> count. Calls
        getPortalsWithTranslators() first if that has not been done yet.
        """
        if not self.portalsWithTranslators:
            self.getPortalsWithTranslators()
        # Compile once instead of once per language.
        userLinkR = re.compile(r'\[\[[Uu][Ss][Ee][Rr]\s?:([^\[\]\n]*)\]\]')
        userTemplateR = re.compile(r'\{\{[Uu]ser\s?\|([^\[\]\n]*)\}\}')
        tags = ['comments', 'nowiki', 'pre', 'source']
        for lang, portalName in self.portalsWithTranslators.items():
            # Use the spelling taken from the real page title, not the
            # lower-cased key, so that the existing page is addressed.
            subPageTitle = '%s:%s%s' % (self.nsNamePortal, portalName, self.translators)
            try:
                page = pywikibot.Page(self.site, subPageTitle)
                if page.isRedirectPage():
                    page = page.getRedirectTarget()
                text = pywikibot.removeDisabledParts(page.get(), tags)
            except pywikibot.NoPage:
                pywikibot.output(u'Error: page "%s" disappeared.' % subPageTitle)
                continue
            count = len(userLinkR.findall(text)) + len(userTemplateR.findall(text))
            if count:
                previous = self.portalsWithNonZeroTranslators.get(lang, 0)
                self.portalsWithNonZeroTranslators[lang] = previous + count

    def pageLink(self, title, anchor):
        """Return a piped wikilink to title, labelled with anchor."""
        return '[[%s|%s]]' % (title, anchor)

    def printLine(self, line):
        """Print one line of the report to standard output."""
        print(line)

    def printHeadLine(self, line):
        """
        Print line as a level 2 wikitext headline.

        Placeholders of the form '{X:label}', with X one of L, P, S, T, are
        replaced by a link to self.pageTitle[X] labelled 'label'. If no
        title is known for X, the plain label is used.
        """
        Template = re.compile(r'(\{([LPST])\:([^}]*)\})')
        for matched, key, anchor in Template.findall(line):
            if key in self.pageTitle:
                anchor = self.pageLink(self.pageTitle[key], anchor)
            line = line.replace(matched, anchor)
        self.printLine('== %s ==' % line)

    def printLanguage(self, lang, anchor=''):
        """Print a numbered list item linking to the portal of lang."""
        self.printLine(u'# [[Portal:%s|%s]]' % (lang, anchor))

    def printSortedLanguageList(self, langlist):
        """Print a list item for every key of langlist, sorted."""
        for lang in sorted(langlist):
            self.printLanguage(lang)

    def printSortedLanguageListDifference(self, langlist, exceptdict):
        """Print the sorted keys of langlist that are not in exceptdict."""
        for lang in sorted(langlist):
            if lang not in exceptdict:
                self.printLanguage(lang)

    def showLanguagesWithTranslators(self):
        """Report the languages that have a translators subpage."""
        self.printHeadLine(u'Languages with {T:translators subpages}')
        self.printSortedLanguageList(self.portalsWithTranslators)

    def showLanguagesWithNonZeroTranslators(self):
        """Report the languages with at least one translator."""
        self.printHeadLine(u'Languages with nonzero translators')
        self.printSortedLanguageList(self.portalsWithNonZeroTranslators)

    def showLanguagesWithZeroTranslators(self):
        """Report the languages whose translators subpage names nobody."""
        self.printHeadLine(u'Languages with {T:translators subpages} and zero translators')
        self.printSortedLanguageListDifference(self.portalsWithTranslators, self.portalsWithNonZeroTranslators)

    def showListedLanguages(self):
        """Report the languages linked from the listed languages page."""
        self.printHeadLine(u'{L:Listed Languages}')
        self.printSortedLanguageList(self.pagePortalLinks)

    def showListedLanguagesWithoutTranslators(self):
        """Report listed languages that have no translators subpage."""
        self.printHeadLine(u'{L:Listed languages} without {T:translators subpages}')
        self.printSortedLanguageListDifference(self.pagePortalLinks, self.portalsWithTranslators)

    def showListedLanguagesWithZeroTranslators(self):
        """Report listed languages for which no translator was counted."""
        self.printHeadLine(u'{L:Listed languages} with zero translators')
        self.printSortedLanguageListDifference(self.pagePortalLinks, self.portalsWithNonZeroTranslators)

    def showLanguagesNotListedDespiteTranslators(self):
        """Report languages having translators that are not listed."""
        self.printHeadLine(u'Languages not {L:listed} despite translators')
        self.printSortedLanguageListDifference(self.portalsWithNonZeroTranslators, self.pagePortalLinks)

    def showLanguagesdWithRedirectedPortalPage(self):
        """Report languages whose portal page is a redirect."""
        self.printHeadLine(u'Languages with {P:Portal page} being a redirect')
        self.printSortedLanguageList(self.portalsWithRedirectedPortalPage)

    def showLanguagesdWithRedirectedTranslatorsPage(self):
        """Report languages whose translators subpage is a redirect."""
        self.printHeadLine(u'Languages with {T:translators subpage} being a redirect')
        self.printSortedLanguageList(self.portalsWithRedirectedTranslatorsPage)

    def showSupportedLanguages(self):
        """Report the languages found on the supported languages page."""
        self.printHeadLine(u'{S:Supported Languages}')
        self.printSortedLanguageList(self.SupportedLanguages)

    def showListedLanguagesNotSupported(self):
        """Report listed languages that are not among the supported ones."""
        self.printHeadLine(u'Languages {L:listed} but not {S:supported}')
        self.printSortedLanguageListDifference(self.pagePortalLinks, self.SupportedLanguages)

    def showSupportedLanguagesNotListed(self):
        """Report supported languages that are not among the listed ones."""
        self.printHeadLine(u'Languages {S:supported} but not {L:listed}')
        self.printSortedLanguageListDifference(self.SupportedLanguages, self.pagePortalLinks)

    def run(self):
        """Collect all data and print every report in a fixed order."""
        self.getSupportedLanguages(self.SupportedLanguagesPageTitle)
        self.showSupportedLanguages()

        self.getListedLanguages(self.ListedLanguagesPageTitle)
        self.showListedLanguages()

        self.getPortalsWithNonZeroTranslators()
        self.showLanguagesWithTranslators()
        self.showLanguagesWithNonZeroTranslators()
        self.showLanguagesWithZeroTranslators()
        # Not shown: nothing collects redirected portal pages yet.
        # self.showLanguagesdWithRedirectedPortalPage()
        self.showLanguagesdWithRedirectedTranslatorsPage()

        self.showListedLanguagesWithoutTranslators()
        self.showListedLanguagesWithZeroTranslators()
        self.showLanguagesNotListedDespiteTranslators()

        self.showListedLanguagesNotSupported()
        self.showSupportedLanguagesNotListed()

# Module-level URL opener used by EthnologueLanguageTree.getPage(). It is
# built with pywikibot's U2RedirectHandler; no cookie processor is installed
# (the line below is left disabled).
# cookieProcessor = urllib2.HTTPCookieProcessor(cj)
MyURLopener = urllib2.build_opener(pywikibot.U2RedirectHandler)

class EthnologueLanguageTree:
    """
    Build and print a classification tree of languages from Ethnologue.

    For every language code a page is fetched from baseURL + code. The
    language name is taken from the <h1> element and the classification
    from the first link following the 'Classification' anchor. The
    comma-separated classification becomes a path in a nested dictionary,
    which is finally printed as wikitext headlines and nested lists.
    """

    baseURL = 'http://www.ethnologue.com/show_language.asp?code='
    verbose = True
    codeMap = {}
    # Class-level fallbacks, kept for compatibility only. __init__ gives
    # each instance its own dictionaries so results are never shared.
    list = {}
    tree = {}

    def __init__(self, *args):
        """
        Compile the regular expressions and create empty result stores.

        Positional arguments are accepted but not used.
        """
        self.nameR = re.compile(r'<h1>\s*([^<]+?)\s*<\/h1>')
        self.treeR = re.compile(r'(?s)>Classification<\/a>.*?<a\s[^>]+>\s*([^<]+?)\s*?<\/a>')
        self.spltR = re.compile(r'\s*,\s*')
        self.naecR = re.compile(r'Not an Ethnologue[ 0-9]*language code|Invalid language code|New language identifier code')
        # code -> entry dictionary as returned by getEntry()
        self.list = {}
        # Nested dictionaries; the key '' holds the leaves {name: code}.
        self.tree = {}

    def pageUrl(self, code):
        """Return the URL of the page for the language code."""
        return '%s%s' % (self.baseURL, code)

    def _waitBeforeRetry(self, url, retry_idle_time):
        """
        Announce a retry, sleep retry_idle_time minutes, return the next
        idle time: twice as long, but not longer than half an hour.
        """
        pywikibot.output(
u"WARNING: Could not open '%s'.\n Maybe the server or your connection is down. Retrying in %i minutes..."
            % (url, retry_idle_time))
        time.sleep(retry_idle_time * 60)
        return min(retry_idle_time * 2, 30)

    def getPage(self, url, retry=True, data='', compress=True, no_hostname=True, back_response=False, verbose=verbose):
        """
        Fetch url and return its content as a unicode string.

        Parameters:
            url           - the address to fetch.
            retry         - if true, retry up to config.maxretries times
                            after a failure; if false, do not retry.
            data          - unused, kept for interface compatibility.
            compress      - if true, gunzip a gzip-encoded response.
            no_hostname   - unused, kept for interface compatibility.
            back_response - if true, return (response, text) instead of
                            the text alone.
            verbose       - if true, warn when no character set is given.

        Raises pywikibot.PageNotFound for HTTP 401, 403 and 404, and
        pywikibot.MaxTriesExceededError when the retries are used up.
        Other HTTP errors, except 504 which is retried, are re-raised.
        """
        # Number of retries still allowed. With retry=False this used to
        # be 1, which allowed one retry after a one minute sleep.
        if retry:
            retry_attempt = config.maxretries
        else:
            retry_attempt = 0
        retry_idle_time = 1
        while True:
            try:
                request = urllib2.Request(url)
                f = MyURLopener.open(request)
                # read & info can raise socket.error
                text = f.read()
                headers = f.info()
                break
            except KeyboardInterrupt:
                raise
            except urllib2.HTTPError as e:
                if e.code in (401, 404):
                    raise pywikibot.PageNotFound(u'Page %s could not be retrieved.' % url)
                elif e.code == 403:
                    raise pywikibot.PageNotFound(u'Page %s could not be retrieved. Check your virus wall.' % url)
                elif e.code == 504:
                    pywikibot.output(u'HTTPError: %s %s' % (e.code, e.msg))
                else:
                    pywikibot.output(u"Result: %s %s" % (e.code, e.msg))
                    raise
            except Exception as e:
                pywikibot.output(u'%s' % e)
            # Only retryable failures get here: HTTP 504 and non-HTTP errors.
            if retry_attempt <= 0:
                raise pywikibot.MaxTriesExceededError()
            retry_attempt -= 1
            retry_idle_time = self._waitBeforeRetry(url, retry_idle_time)
        contentType = headers.get('content-type', '')
        contentEncoding = headers.get('content-encoding', '')
        # Ensure that all sent data is received; fetch again otherwise.
        if 'content-length' in headers and int(headers.get('content-length', '0')) != len(text):
            pywikibot.output(u'Warning! len(text) does not match content-length: %s != %s'
                             % (len(text), headers.get('content-length')))
            return self.getPage(url, retry, data, compress, no_hostname, back_response, verbose)
        if compress and contentEncoding == 'gzip':
            text = pywikibot.decompress_gzip(text)
        R = re.compile('charset=([^\'\";]+)')
        m = R.search(contentType)
        if m:
            charset = m.group(1)
        else:
            if verbose:
                pywikibot.output(u"WARNING: No character set found.")
            # UTF-8 as default
            charset = 'utf-8'
        # Convert HTML to Unicode
        try:
            text = unicode(text, charset, errors='strict')
        except UnicodeDecodeError as e:
            # Diagnostics go through pywikibot.output(); print is used for
            # the report itself.
            pywikibot.output(u'%s' % e)
            pywikibot.output(u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % url)
            # We use errors='replace' in case of bad encoding.
            text = unicode(text, charset, errors='replace')
        if back_response:
            return f, text
        return text

    def _parseEntry(self, code, text):
        """
        Extract name and classification of the language code from text.

        Returns a dictionary with the keys 'code', 'name' and 'tree'. The
        name is '' if no <h1> element is found, the tree is 'Other' if no
        classification is found. Names that are one of the known 'not a
        language code' notices are prefixed with the code.
        """
        found = self.nameR.search(text)
        if found:
            name = found.group(1)
        else:
            # Formerly None, which made naecR.match() raise a TypeError.
            name = ''
        found = self.treeR.search(text)
        if found:
            tree = found.group(1)
        else:
            tree = 'Other'
        if self.naecR.match(name):
            name = '%s - %s' % (code, name)
        return {
            'code': code,
            'name': name,
            'tree': tree,
        }

    def getEntry(self, code):
        """
        Fetch the page for code and return its entry dictionary.

        The dictionary has the keys 'code', 'name' and 'tree'. If
        self.verbose is true, the entry is also written to the log.
        """
        entry = self._parseEntry(code, self.getPage(self.pageUrl(code)))
        if self.verbose:
            pywikibot.output(' == %s == %s == %s ==' % (code, entry['name'], entry['tree']))
        return entry

    def mapCode(self, code):
        """Return the replacement for code from codeMap, else code itself."""
        return self.codeMap.get(code, code)

    def getList(self, codelist):
        """Fetch the entry of every code in codelist into self.list."""
        for code in codelist:
            self.list[code] = self.getEntry(code)

    def list2tree(self):
        """
        Sort all entries of self.list into the nested self.tree.

        Each part of the comma-separated classification is one level of
        nested dictionaries. At the final level, the key '' holds a
        dictionary mapping language names to codes. A name occurring
        twice at the same place is reported, and the later code wins.
        """
        for code, entry in self.list.items():
            node = self.tree
            for item in self.spltR.split(entry['tree']):
                node = node.setdefault(item, {})
            leaves = node.setdefault('', {})
            name = entry['name']
            if name in leaves:
                pywikibot.output('ERROR: %s - Duplicate language name: %s ' % (code, name))
            leaves[name] = code

    def decommafy(self, name):
        """
        Reverse the comma-separated parts of name and join them by spaces,
        e.g. 'German, Standard' becomes 'Standard German'.
        """
        parts = self.spltR.split(name)
        parts.reverse()
        return ' '.join(parts)

    def levelIndent(self, level=0):
        """Return the wikitext list prefix of level asterisks."""
        # Formerly a slice of a fixed string of 39 asterisks, which capped
        # the depth silently.
        return '*' * level

    def printLine(self, line):
        """Print one line of the report to standard output."""
        print(line)

    def printHead(self, line):
        """Print line as a level 2 wikitext headline after a blank line."""
        self.printLine('\n== %s ==' % line)

    def printLanguage(self, level, code, anchor=''):
        """Print a bold portal link for code as a list item at level."""
        self.printLine(u"%s'''[[Portal:%s|%s]]'''" % (self.levelIndent(level), code, anchor))

    def printSortedDict(self, branch, level=0):
        """
        Print branch recursively, keys sorted.

        Top level keys become headlines, deeper ones list items. A negative
        level marks branch as a leaf dictionary {name: code}, to be printed
        as language links at depth -level.
        """
        for name in sorted(branch.keys()):
            if level < 0:
                self.printLanguage(-level, branch[name], self.decommafy(name))
            elif name:
                if level:
                    self.printLine('%s%s' % (self.levelIndent(level), name))
                else:
                    self.printHead(name)
                self.printSortedDict(branch[name], level + 1)
            else:
                # The key '' holds the languages of this very node.
                self.printSortedDict(branch[name], -level)

    def run(self):
        """Fetch a fixed set of language codes and print their tree."""
        # Codes listed twice before ('zea', 'nds') are given once only;
        # fetching them again merely repeated the request.
        codes = [
            'abs', 'abk', 'ksk', 'deu', 'nov', 'eur', 'mul', 'dum', 'cdi', 'cds',
            'cda', 'ksh', 'sxu', 'lim', 'zea', 'gro', 'wep', 'nds', 'drh', 'dre',
            'nld', 'afr', 'lfn', 'drt', 'act', 'dse', 'gos', 'sdz', 'twd', 'stl',
            'vla', 'vel', 'rmy', 'rmo', 'fry', 'pdt', 'vls', 'dlc',
        ]
        self.getList(codes)
        self.list2tree()
        self.printSortedDict(self.tree)


def main(*args):
    """
    Run the translator portal reports.

    All positional arguments are passed on to TranslatorPortalCollector.
    """
    app = TranslatorPortalCollector(*args)
    app.run()
    # The Ethnologue tree is not finalized yet (see the TODO in the file
    # header). Its invocation used to sit unreachable behind a return; to
    # run it instead, use: EthnologueLanguageTree(*args).run()

# Script entry point: run main() only when executed directly.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Call pywikibot.stopme() in any case, even if main() raised.
        pywikibot.stopme()