2012-05-26: Because of issues with the source code management for MediaWiki extensions, no exports of translations can be made at the moment.
Our apologies. All your translations will of course be updated as soon as the issue is resolved. (Other news...)
User:Purodha/translatorlanguages.py
From translatewiki.net
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Get translators language portal pages from translatewiki.net
"""
#
# (C) Purodha Blissenbach, 2011.
#
__version__ = "$Id: editarticle.py 8365 2010-07-26 20:52:10Z purodha $"
#
# Distributed under the terms of the MIT license.
#
# Version 1.8.
#
# Note: This program will take several minutes to read all the language
# portals translators pages.

# TODO:
# find the last edit of a message of a language, but ignore
# FUZZYBOT and nontranslators, or excluded credit takers.
# Finalize Ethnologue tree

__metaclass__ = type

import string
import re
import wikipedia as pywikibot
import pagegenerators
import config
import urllib2
import time


class TranslatorPortalCollector:
    """Collect language portal data from the wiki and print comparison lists.

    Three sources are read: the supported-languages special page, the
    "Languages by language family" page, and the ``/translators`` subpages
    in the Portal namespace.  The ``show*`` methods print wikitext sections
    (a headline followed by numbered ``[[Portal:xx|]]`` links) to stdout.
    """

    # Fix constants
    SupportedLanguagesPageTitle = u'Special:SupportedLanguages'
    ListedLanguagesPageTitle = u'Languages by language family'
    translators = u'/translators'
    nsNamePortal = u'Portal'
    # Computed constants
    pageLinkSupportedLanguages = None
    PageLinkListedLanguages = None
    translalen = None
    nsPortal = None
    site = None
    # Maps a one-letter headline key ('S', 'L', ...) to a page title.
    # Kept for backward compatibility; each instance gets its own dict
    # in __init__.
    pageTitle = {}

    def __init__(self, *args):
        """Connect to the 'i18n' site and set up empty result dictionaries.

        Positional arguments are accepted but not used.
        """
        config.family = 'i18n'
        config.language = 'i18n'
        self.site = pywikibot.getSite(config.language, config.family)
        self.nsPortal = self.site.getNamespaceIndex(self.nsNamePortal)
        # Length of the "Portal:" prefix (namespace name plus colon).
        self.portalen = len(self.nsNamePortal) + 1
        self.translalen = len(self.translators)
        # Per-instance copy, so instances do not share the class-level dict.
        self.pageTitle = {}
        # All of the following map a lower-cased language code to a value.
        # NOTE: nothing in this file fills portalsWithRedirectedPortalPage
        # yet; it is initialised so that
        # showLanguagesdWithRedirectedPortalPage() does not fail.
        self.portalsWithRedirectedPortalPage = {}
        self.portalsWithRedirectedTranslatorsPage = {}
        self.portalsWithTranslators = {}
        self.portalsWithNonZeroTranslators = {}
        self.pagePortalLinks = {}
        self.SupportedLanguages = {}

    def getSupportedLanguages(self, pageTitle):
        """Fetch the rendered page and record every linked portal code.

        The page is requested through ``site.getUrl`` and scanned for
        ``href`` attributes ending in ``Portal:<code>``.  Each code is
        stored lower-cased in ``self.SupportedLanguages``.  The title is
        remembered under key 'S' for use in headlines.
        """
        self.pageTitle['S'] = pageTitle
        try:
            page = pywikibot.Page(self.site, pageTitle)
            address = '%s?title=%s' % (self.site.path(), page.urlname())
            text = self.site.getUrl(address)
            tags = ['comments', 'pre']
            text = pywikibot.removeDisabledParts(text, tags)
            portalR = re.compile(r'href="[^"]*[\/=][Pp][Oo][Rr][Tt][Aa][Ll]\s?:\s?([a-zA-Z\-]+)"\s?')
            for lang in portalR.findall(text):
                lang = lang.lower()
                self.SupportedLanguages[lang] = lang
        except pywikibot.exceptions.PageNotFound:
            pywikibot.output(u'Error: page "%s" not found.' % pageTitle)

    def getListedLanguages(self, pageTitle):
        """Read the wikitext of a page and record its piped portal links.

        Every ``[[Portal:<code>|<label>]]`` link is stored in
        ``self.pagePortalLinks`` as lower-cased code -> label.  Redirects
        are followed.  The title is remembered under key 'L' for use in
        headlines.
        """
        self.pageTitle['L'] = pageTitle
        try:
            page = pywikibot.Page(self.site, pageTitle)
            if page.isRedirectPage():
                page = page.getRedirectTarget()
            tags = ['comments', 'nowiki', 'pre', 'source']
            text = pywikibot.removeDisabledParts(page.get(), tags)
            portalR = re.compile(r'\[\[[Pp][Oo][Rr][Tt][Aa][Ll]\s?:\s?([a-zA-Z\-]+)\s?\|([^\[\]\n]*)\]\]')
            for lang, pagetitle in portalR.findall(text):
                self.pagePortalLinks[lang.lower()] = pagetitle
        except pywikibot.NoPage:
            pywikibot.output(u'Error: page "%s" not found.' % pageTitle)

    def getPortalsWithTranslators(self):
        """Scan the Portal namespace for pages ending in '/translators'.

        Redirect pages go to ``self.portalsWithRedirectedTranslatorsPage``,
        all others to ``self.portalsWithTranslators``.  Both map the
        lower-cased language code to the code as spelled in the title.
        """
        for page in self.site.allpages(namespace=self.nsPortal):
            title = page.title()
            if not title.endswith(self.translators):
                continue
            lang = title[self.portalen:-self.translalen]
            if page.isRedirectPage():
                self.portalsWithRedirectedTranslatorsPage[lang.lower()] = lang
            else:
                self.portalsWithTranslators[lang.lower()] = lang

    def getPortalsWithNonZeroTranslators(self):
        """Count user links and {{user|...}} templates on each subpage.

        Runs getPortalsWithTranslators() first if nothing was collected
        yet.  Languages with at least one hit are stored in
        ``self.portalsWithNonZeroTranslators`` as code -> number of hits.
        """
        if not self.portalsWithTranslators:
            self.getPortalsWithTranslators()
        # Compiled once; the patterns do not depend on the language.
        userLinkR = re.compile(r'\[\[[Uu][Ss][Ee][Rr]\s?:([^\[\]\n]*)\]\]')
        userTemplateR = re.compile(r'\{\{[Uu]ser\s?\|([^\[\]\n]*)\}\}')
        tags = ['comments', 'nowiki', 'pre', 'source']
        for lang, titleLang in self.portalsWithTranslators.items():
            # Build the title from the spelling found in the page list;
            # the dictionary key is lower-cased and may not match the
            # case of the real page title.
            subPageTitle = '%s:%s%s' % (self.nsNamePortal, titleLang,
                                        self.translators)
            try:
                page = pywikibot.Page(self.site, subPageTitle)
                if page.isRedirectPage():
                    page = page.getRedirectTarget()
                text = pywikibot.removeDisabledParts(page.get(), tags)
            except pywikibot.NoPage:
                pywikibot.output(u'Error: page "%s" disappeared.' % subPageTitle)
                continue
            count = (len(userLinkR.findall(text))
                     + len(userTemplateR.findall(text)))
            if count:
                self.portalsWithNonZeroTranslators[lang] = (
                    self.portalsWithNonZeroTranslators.get(lang, 0) + count)

    def pageLink(self, title, anchor):
        """Return a piped wiki link ``[[title|anchor]]``."""
        return '[[%s|%s]]' % (title, anchor)

    def printLine(self, line):
        """Print one line of output to stdout."""
        print(line)

    def printHeadLine(self, line):
        """Print a level-2 wikitext headline, expanding link placeholders.

        A placeholder has the form ``{K:anchor}`` with K one of L, P, S, T.
        If ``self.pageTitle`` has an entry for K the placeholder becomes a
        link to that page, otherwise it is replaced by the bare anchor.
        """
        Template = re.compile(r'(\{([LPST])\:([^}]*)\})')
        for matched, key, anchor in Template.findall(line):
            if key in self.pageTitle:
                anchor = self.pageLink(self.pageTitle[key], anchor)
            line = line.replace(matched, anchor)
        self.printLine('== %s ==' % line)

    def printLanguage(self, lang, anchor=''):
        """Print a numbered list item linking to the portal of ``lang``."""
        self.printLine(u'# [[Portal:%s|%s]]' % (lang, anchor))

    def printSortedLanguageList(self, langlist):
        """Print all keys of ``langlist`` in sorted order."""
        for lang in sorted(langlist.keys()):
            self.printLanguage(lang)

    def printSortedLanguageListDifference(self, langlist, exceptdict):
        """Print the sorted keys of ``langlist`` missing from ``exceptdict``."""
        for lang in sorted(langlist.keys()):
            if lang not in exceptdict:
                self.printLanguage(lang)

    def showLanguagesWithTranslators(self):
        """Print the languages having a non-redirect translators subpage."""
        self.printHeadLine(u'Languages with {T:translators subpages}')
        self.printSortedLanguageList(self.portalsWithTranslators)

    def showLanguagesWithNonZeroTranslators(self):
        """Print the languages with at least one counted translator."""
        self.printHeadLine(u'Languages with nonzero translators')
        self.printSortedLanguageList(self.portalsWithNonZeroTranslators)

    def showLanguagesWithZeroTranslators(self):
        """Print the languages whose translators subpage lists nobody."""
        self.printHeadLine(u'Languages with {T:translators subpages} and zero translators')
        self.printSortedLanguageListDifference(
            self.portalsWithTranslators, self.portalsWithNonZeroTranslators)

    def showListedLanguages(self):
        """Print the languages linked from the listed-languages page."""
        self.printHeadLine(u'{L:Listed Languages}')
        self.printSortedLanguageList(self.pagePortalLinks)

    def showListedLanguagesWithoutTranslators(self):
        """Print listed languages that have no translators subpage."""
        self.printHeadLine(u'{L:Listed languages} without {T:translators subpages}')
        self.printSortedLanguageListDifference(
            self.pagePortalLinks, self.portalsWithTranslators)

    def showListedLanguagesWithZeroTranslators(self):
        """Print listed languages without any counted translator."""
        self.printHeadLine(u'{L:Listed languages} with zero translators')
        self.printSortedLanguageListDifference(
            self.pagePortalLinks, self.portalsWithNonZeroTranslators)

    def showLanguagesNotListedDespiteTranslators(self):
        """Print languages with translators that are not listed."""
        self.printHeadLine(u'Languages not {L:listed} despite translators')
        self.printSortedLanguageListDifference(
            self.portalsWithNonZeroTranslators, self.pagePortalLinks)

    def showLanguagesdWithRedirectedPortalPage(self):
        """Print the languages recorded as having a redirecting portal page."""
        self.printHeadLine(u'Languages with {P:Portal page} being a redirect')
        self.printSortedLanguageList(self.portalsWithRedirectedPortalPage)

    def showLanguagesdWithRedirectedTranslatorsPage(self):
        """Print the languages whose translators subpage is a redirect."""
        self.printHeadLine(u'Languages with {T:translators subpage} being a redirect')
        self.printSortedLanguageList(self.portalsWithRedirectedTranslatorsPage)

    def showSupportedLanguages(self):
        """Print the languages found on the supported-languages page."""
        self.printHeadLine(u'{S:Supported Languages}')
        self.printSortedLanguageList(self.SupportedLanguages)

    def showListedLanguagesNotSupported(self):
        """Print listed languages missing from the supported languages."""
        self.printHeadLine(u'Languages {L:listed} but not {S:supported}')
        self.printSortedLanguageListDifference(
            self.pagePortalLinks, self.SupportedLanguages)

    def showSupportedLanguagesNotListed(self):
        """Print supported languages missing from the listed languages."""
        self.printHeadLine(u'Languages {S:supported} but not {L:listed}')
        self.printSortedLanguageListDifference(
            self.SupportedLanguages, self.pagePortalLinks)

    def run(self):
        """Collect all data and print every report section in order."""
        self.getSupportedLanguages(self.SupportedLanguagesPageTitle)
        self.showSupportedLanguages()
        self.getListedLanguages(self.ListedLanguagesPageTitle)
        self.showListedLanguages()
        self.getPortalsWithNonZeroTranslators()
        self.showLanguagesWithTranslators()
        self.showLanguagesWithNonZeroTranslators()
        self.showLanguagesWithZeroTranslators()
        # self.showLanguagesdWithRedirectedPortalPage()
        self.showLanguagesdWithRedirectedTranslatorsPage()
        self.showListedLanguagesWithoutTranslators()
        self.showListedLanguagesWithZeroTranslators()
        self.showLanguagesNotListedDespiteTranslators()
        self.showListedLanguagesNotSupported()
        self.showSupportedLanguagesNotListed()
        return


# cookieProcessor = urllib2.HTTPCookieProcessor(cj)
MyURLopener = urllib2.build_opener(pywikibot.U2RedirectHandler)


class EthnologueLanguageTree:
    """Fetch language pages from ethnologue.com and print them as a tree.

    For each language code the page title and the first link after the
    "Classification" anchor are extracted.  The comma-separated
    classification is turned into nested dictionaries, which are printed
    as wikitext with one '*' per nesting level.
    """

    baseURL = 'http://www.ethnologue.com/show_language.asp?code='
    verbose = True
    # Class-level defaults, kept for backward compatibility; __init__
    # gives every instance its own dictionaries.
    codeMap = {}
    list = {}
    tree = {}

    def __init__(self, *args):
        """Compile the page-scraping patterns and reset the result dicts.

        Positional arguments are accepted but not used.
        """
        self.nameR = re.compile(r'<h1>\s*([^<]+?)\s*<\/h1>')
        self.treeR = re.compile(r'(?s)>Classification<\/a>.*?<a\s[^>]+>\s*([^<]+?)\s*?<\/a>')
        self.spltR = re.compile(r'\s*,\s*')
        self.naecR = re.compile(r'Not an Ethnologue[ 0-9]*language code|Invalid language code|New language identifier code')
        # Copy the class-level mapping so a preset codeMap is honoured
        # without sharing mutations between instances.
        self.codeMap = dict(self.codeMap)
        self.list = {}
        self.tree = {}

    def pageUrl(self, code):
        """Return the URL of the language page for ``code``."""
        return '%s%s' % (self.baseURL, code)

    def _pauseBeforeRetry(self, url, retry_idle_time):
        """Warn, sleep ``retry_idle_time`` minutes, return the next delay.

        The delay doubles with every call but never exceeds 30 minutes.
        """
        pywikibot.output(
            u"WARNING: Could not open '%s'.\n Maybe the server or your "
            u"connection is down. Retrying in %i minutes..."
            % (url, retry_idle_time))
        time.sleep(retry_idle_time * 60)
        # Next time wait longer, but not longer than half an hour
        return min(retry_idle_time * 2, 30)

    def getPage(self, url, retry=True, data='', compress=True,
                no_hostname=True, back_response=False, verbose=verbose):
        """Download ``url`` and return its body decoded to unicode.

        Parameters:
            url           - address to fetch.
            retry         - if true, allow ``config.maxretries`` retries,
                            otherwise one.
            data          - unused; only handed on when the download is
                            repeated.
            compress      - gunzip the body if the response is
                            gzip-encoded.
            no_hostname   - unused; only handed on when the download is
                            repeated.
            back_response - return ``(response, text)`` instead of text.
            verbose       - report a missing charset declaration.

        Raises:
            pywikibot.PageNotFound on HTTP 401, 403 and 404.
            pywikibot.MaxTriesExceededError when the retries are used up
            after HTTP 504 or any non-HTTP error.
            urllib2.HTTPError (re-raised) for every other HTTP status.

        NOTE(review): PageNotFound, MaxTriesExceededError, output and
        decompress_gzip were referenced without a module prefix and were
        therefore undefined here.  They are assumed to be exported by the
        compat ``wikipedia`` module imported as ``pywikibot`` - confirm.
        """
        if retry:
            retry_attempt = config.maxretries
        else:
            retry_attempt = 1
        retry_idle_time = 1
        while True:
            try:
                request = urllib2.Request(url)  # , data, headers)
                f = MyURLopener.open(request)
                # read & info can raise socket.error
                text = f.read()
                headers = f.info()
                break
            except KeyboardInterrupt:
                raise
            except urllib2.HTTPError as e:
                if e.code in [401, 404]:
                    raise pywikibot.PageNotFound(
                        u'Page %s could not be retrieved.' % url)
                elif e.code in [403]:
                    raise pywikibot.PageNotFound(
                        u'Page %s could not be retrieved. Check your virus wall.' % url)
                elif e.code == 504:
                    pywikibot.output(u'HTTPError: %s %s' % (e.code, e.msg))
                    if retry_attempt <= 0:
                        raise pywikibot.MaxTriesExceededError()
                    retry_attempt -= 1
                    retry_idle_time = self._pauseBeforeRetry(
                        url, retry_idle_time)
                    continue
                else:
                    pywikibot.output(u"Result: %s %s" % (e.code, e.msg))
                    raise
            except Exception as e:
                pywikibot.output(u'%s' % e)
                if retry_attempt <= 0:
                    raise pywikibot.MaxTriesExceededError()
                retry_attempt -= 1
                retry_idle_time = self._pauseBeforeRetry(url, retry_idle_time)
                continue

        contentType = headers.get('content-type', '')
        contentEncoding = headers.get('content-encoding', '')
        # Ensure that all sent data is received
        if int(headers.get('content-length', '0')) != len(text) and 'content-length' in headers:
            pywikibot.output(u'Warning! len(text) does not match content-length: %s != %s' %
                             (len(text), headers.get('content-length')))
            # Start over; pass verbose along so the caller's choice is
            # kept for the repeated download.
            return self.getPage(url, retry, data, compress, no_hostname,
                                back_response, verbose)
        if compress and contentEncoding == 'gzip':
            text = pywikibot.decompress_gzip(text)
        R = re.compile('charset=([^\'\";]+)')
        m = R.search(contentType)
        if m:
            charset = m.group(1)
        else:
            if verbose:
                pywikibot.output(u"WARNING: No character set found.")
            # UTF-8 as default
            charset = 'utf-8'
        # Convert HTML to Unicode
        try:
            text = unicode(text, charset, errors='strict')
        except UnicodeDecodeError as e:
            print(e)
            pywikibot.output(u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % url)
            # We use error='replace' in case of bad encoding.
            text = unicode(text, charset, errors='replace')
        if back_response:
            return f, text
        return text

    def getEntry(self, code):
        """Fetch the page for ``code`` and return its name and tree.

        Returns a dict with the keys 'code', 'name' and 'tree'.  'name'
        is the page's <h1> text, or '' if the page has none.  If the
        heading is one of the site's "not a language code" messages, the
        code is prepended.  'tree' is the text of the first link after
        the Classification anchor, or 'Other' if there is none.
        """
        text = self.getPage(self.pageUrl(code))
        found = self.nameR.search(text)
        # Fall back to an empty name; a None here would make the
        # naecR.match() call below raise TypeError.
        name = found.group(1) if found else ''
        found = self.treeR.search(text)
        tree = found.group(1) if found else 'Other'
        if self.naecR.match(name):
            name = '%s - %s' % (code, name)
        if self.verbose:
            pywikibot.output(' == %s == %s == %s ==' % (code, name, tree))
        retval = {
            'code': code,
            'name': name,
            'tree': tree,
        }
        return retval

    def mapCode(self, code):
        """Return the replacement for ``code`` from codeMap, else ``code``."""
        return self.codeMap.get(code, code)

    def getList(self, codelist):
        """Fetch the entry of every code in ``codelist`` into self.list."""
        for code in codelist:
            self.list[code] = self.getEntry(code)

    def list2tree(self):
        """Sort the fetched entries into the nested dict ``self.tree``.

        Each comma-separated part of an entry's 'tree' value becomes one
        nesting level.  The languages of a node are kept under the key ''
        as a dict mapping language name to code.  A name that is already
        present at a node is reported and overwritten.
        """
        for code in self.list:
            entry = self.list[code]
            this = self.tree
            for item in self.spltR.split(entry['tree']):
                this = this.setdefault(item, {})
            name = entry['name']
            leaves = this.setdefault('', {})
            if name in leaves:
                pywikibot.output('ERROR: %s - Duplicate language name: %s ' % (code, name))
            leaves[name] = code

    def decommafy(self, name):
        """Reverse the comma-separated parts of ``name``, joined by spaces."""
        splt = self.spltR.split(name)
        splt.reverse()
        return ' '.join(splt)

    def levelIndent(self, level=0):
        """Return ``level`` asterisks (at most 39) as wikitext list prefix."""
        return '***************************************'[:level]

    def printLine(self, line):
        """Print one line of output to stdout."""
        print(line)

    def printHead(self, line):
        """Print a level-2 wikitext headline preceded by an empty line."""
        self.printLine('\n== %s ==' % line)

    def printLanguage(self, level, code, anchor=''):
        """Print a bold portal link for ``code``, indented to ``level``."""
        self.printLine(u"%s'''[[Portal:%s|%s]]'''" % (self.levelIndent(level), code, anchor))

    def printSortedDict(self, branch, level=0):
        """Print ``branch`` recursively with its keys in sorted order.

        A negative ``level`` marks ``branch`` as a name -> code mapping
        of languages, printed at depth ``-level``.  Otherwise a non-empty
        key is a sub-classification (a headline at level 0, an indented
        line below that) and the key '' holds the languages of this node.
        """
        for name in sorted(branch.keys()):
            if level < 0:
                code = branch[name]
                self.printLanguage(-level, code, self.decommafy(name))
            elif name:
                if level:
                    self.printLine('%s%s' % (self.levelIndent(level), name))
                else:
                    self.printHead(name)
                self.printSortedDict(branch[name], level + 1)
            else:
                self.printSortedDict(branch[name], -level)

    def run(self):
        """Fetch a fixed set of language codes and print their tree."""
        # Alternative code lists; only the last assignment takes effect.
        l = ['abs', 'abk', 'ksk', 'deu', 'nov', 'ksh']
        l = ['nds', 'wep', 'stl', 'sdt']
        l = [
            'abs', 'abk', 'ksk', 'deu', 'nov', 'eur', 'mul', 'dum',
            'cdi', 'cds', 'cda', 'ksh', 'sxu', 'lim', 'zea', 'gro',
            'wep', 'nds', 'drh', 'dre', 'nld', 'afr', 'lfn', 'drt',
            'act', 'dse', 'gos', 'sdz', 'twd', 'stl', 'vla', 'zea',
            'vel', 'rmy', 'rmo', 'fry', 'nds', 'pdt', 'vls', 'dlc',
        ]
        self.getList(l)
        self.list2tree()
        self.printSortedDict(self.tree)
        return
        # Unreachable single-entry test calls, kept from the original.
        self.getEntry('abs')
        self.getEntry('abk')
        self.getEntry('ksk')
        self.getEntry('deu')
        self.getEntry('nov')
        self.getEntry('eur')
        self.getEntry('mul')
        self.getEntry('dum')
        self.getEntry('cdi')
        self.getEntry('cds')
        self.getEntry('cda')
        self.getEntry('ksh')


def main(*args):
    """Run the translator portal report."""
    app = TranslatorPortalCollector(*args)
    app.run()
    return
    # Unreachable: the Ethnologue tree is not finalized yet (see TODO).
    app = EthnologueLanguageTree(*args)
    app.run()
    return


if __name__ == "__main__":
    try:
        main()
    finally:
        pywikibot.stopme()