User:Purodha/translatorlanguages.py
Appearance
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Get translators language portal pages from translatewiki.net
"""
#
# (C) Purodha Blissenbach, 2011.
#
__version__ = "$Id: editarticle.py 8365 2010-07-26 20:52:10Z purodha $"
#
# Distributed under the terms of the MIT license.
#
# Version 1.8.
#
# Note: This program takes several minutes to read the translators pages
# of all language portals.
# TODO:
# - Find the last edit of a message of a language, but ignore FuzzyBot,
#   non-translators, and excluded credit takers.
# - Finalize the Ethnologue tree.
__metaclass__ = type
import string
import re
import wikipedia as pywikibot
import pagegenerators
import config
import urllib2
import time
class TranslatorPortalCollector:
    """Collect language portal data from the wiki and print wikitext reports.

    The collector gathers three sets of language codes:

    * languages linked from the "supported languages" special page,
    * languages linked from the "languages by language family" page,
    * languages having a ``Portal:<code>/translators`` subpage, together
      with the number of user links / ``{{user|...}}`` templates found on
      each such subpage.

    The ``show*`` methods print sorted lists and differences of these sets
    as numbered wikitext lists through :meth:`printLine`.
    """

    # Fixed constants
    SupportedLanguagesPageTitle = u'Special:SupportedLanguages'
    ListedLanguagesPageTitle = u'Languages by language family'
    translators = u'/translators'
    nsNamePortal = u'Portal'
    # Computed constants (filled in by __init__)
    pageLinkSupportedLanguages = None
    PageLinkListedLanguages = None
    translalen = None
    nsPortal = None
    site = None
    # Kept for backward compatibility; every instance gets its own dict.
    pageTitle = {}

    # Patterns are compiled once instead of on every call / loop iteration.
    _portalHrefR = re.compile(
        r'href="[^"]*[\/=][Pp][Oo][Rr][Tt][Aa][Ll]\s?:\s?([a-zA-Z\-]+)"\s?')
    _portalLinkR = re.compile(
        r'\[\[[Pp][Oo][Rr][Tt][Aa][Ll]\s?:\s?([a-zA-Z\-]+)\s?\|([^\[\]\n]*)\]\]')
    _userLinkR = re.compile(r'\[\[[Uu][Ss][Ee][Rr]\s?:([^\[\]\n]*)\]\]')
    # Non-greedy, so that several templates on one line are counted
    # separately instead of being swallowed by a single match.
    _userTemplateR = re.compile(r'\{\{[Uu]ser\s?\|([^\[\]\n]*?)\}\}')
    _headTemplateR = re.compile(r'(\{([LPST])\:([^}]*)\})')

    def __init__(self, *args):
        """Connect to the i18n site and reset all collected data.

        Positional arguments are accepted but not used.
        """
        config.family = 'i18n'
        config.language = 'i18n'
        self.site = pywikibot.getSite(config.language, config.family)
        self.nsPortal = self.site.getNamespaceIndex(self.nsNamePortal)
        # Length of the "Portal:" prefix, including the colon.
        self.portalen = len(self.nsNamePortal) + 1
        self.translalen = len(self.translators)
        # Per-instance copy, so that instances do not share page titles.
        self.pageTitle = {}
        # Never filled by this class, but read by
        # showLanguagesdWithRedirectedPortalPage(); it must exist.
        self.portalsWithRedirectedPortalPage = {}
        self.portalsWithRedirectedTranslatorsPage = {}
        self.portalsWithTranslators = {}
        self.portalsWithNonZeroTranslators = {}
        self.pagePortalLinks = {}
        self.SupportedLanguages = {}

    def getSupportedLanguages(self, pageTitle):
        """Fetch the rendered page *pageTitle* and record linked portals.

        The page is requested through ``site.getUrl``; every ``href`` that
        points to a ``Portal:<code>`` page adds the lowercased code to
        ``self.SupportedLanguages``. The title is remembered under key
        ``'S'`` for use in head lines.
        """
        self.pageTitle['S'] = pageTitle
        try:
            urlname = pywikibot.Page(self.site, pageTitle).urlname()
            address = '%s?title=%s' % (self.site.path(), urlname)
            text = self.site.getUrl(address)
            text = pywikibot.removeDisabledParts(text, ['comments', 'pre'])
            for lang in self._portalHrefR.findall(text):
                lang = lang.lower()
                self.SupportedLanguages[lang] = lang
        except pywikibot.PageNotFound:
            pywikibot.output(u'Error: page "%s" not found.' % pageTitle)

    def getListedLanguages(self, pageTitle):
        """Read the wikitext of *pageTitle* and record its portal links.

        Every ``[[Portal:<code>|<label>]]`` link stores ``<label>`` under
        the lowercased code in ``self.pagePortalLinks``. Redirects are
        followed. The title is remembered under key ``'L'``.
        """
        self.pageTitle['L'] = pageTitle
        try:
            page = pywikibot.Page(self.site, pageTitle)
            if page.isRedirectPage():
                page = page.getRedirectTarget()
            tags = ['comments', 'nowiki', 'pre', 'source']
            text = pywikibot.removeDisabledParts(page.get(), tags)
            for lang, pagetitle in self._portalLinkR.findall(text):
                self.pagePortalLinks[lang.lower()] = pagetitle
        except pywikibot.NoPage:
            pywikibot.output(u'Error: page "%s" not found.' % pageTitle)

    def getPortalsWithTranslators(self):
        """Scan the Portal namespace for ``<code>/translators`` subpages.

        Non-redirect subpages go to ``self.portalsWithTranslators``,
        redirects to ``self.portalsWithRedirectedTranslatorsPage``. Both map
        the lowercased code to the code as spelled in the page title.
        """
        for page in self.site.allpages(namespace=self.nsPortal):
            title = page.title()
            if not title.endswith(self.translators):
                continue
            lang = title[self.portalen:-self.translalen]
            if page.isRedirectPage():
                self.portalsWithRedirectedTranslatorsPage[lang.lower()] = lang
            else:
                self.portalsWithTranslators[lang.lower()] = lang

    def _countTranslators(self, text):
        """Return the number of user links and user templates in *text*."""
        return (len(self._userLinkR.findall(text))
                + len(self._userTemplateR.findall(text)))

    def getPortalsWithNonZeroTranslators(self):
        """Count translators on every translators subpage.

        Runs :meth:`getPortalsWithTranslators` first if it has not produced
        any data yet. Languages with at least one user link or user template
        get their count added to ``self.portalsWithNonZeroTranslators``.
        """
        if not self.portalsWithTranslators:
            self.getPortalsWithTranslators()
        tags = ['comments', 'nowiki', 'pre', 'source']
        for key, lang in self.portalsWithTranslators.items():
            # Build the title from the code as spelled on the wiki, not from
            # the lowercased dictionary key.
            subPageTitle = '%s:%s%s' % (
                self.nsNamePortal, lang, self.translators)
            try:
                page = pywikibot.Page(self.site, subPageTitle)
                if page.isRedirectPage():
                    page = page.getRedirectTarget()
                text = pywikibot.removeDisabledParts(page.get(), tags)
            except pywikibot.NoPage:
                pywikibot.output(
                    u'Error: page "%s" disappeared.' % subPageTitle)
                continue
            count = self._countTranslators(text)
            if count:
                previous = self.portalsWithNonZeroTranslators.get(key, 0)
                self.portalsWithNonZeroTranslators[key] = previous + count

    def pageLink(self, title, anchor):
        """Return a piped wikilink to *title* labelled *anchor*."""
        return '[[%s|%s]]' % (title, anchor)

    def printLine(self, line):
        """Print one line of output."""
        print(line)

    def printHeadLine(self, line):
        """Print *line* as a level 2 heading, expanding ``{X:label}`` marks.

        ``X`` is one of ``L``, ``P``, ``S``, ``T``. If a page title is known
        for ``X`` the mark becomes a link to that page, otherwise it is
        replaced by the bare label.
        """
        for matched, key, anchor in self._headTemplateR.findall(line):
            if key in self.pageTitle:
                anchor = self.pageLink(self.pageTitle[key], anchor)
            line = line.replace(matched, anchor)
        self.printLine('== %s ==' % line)

    def printLanguage(self, lang, anchor=''):
        """Print a numbered list item linking to the portal of *lang*."""
        self.printLine(u'# [[Portal:%s|%s]]' % (lang, anchor))

    def printSortedLanguageList(self, langlist):
        """Print all keys of the dict *langlist* in sorted order."""
        for lang in sorted(langlist):
            self.printLanguage(lang)

    def printSortedLanguageListDifference(self, langlist, exceptdict):
        """Print the sorted keys of *langlist* that are not in *exceptdict*."""
        for lang in sorted(langlist):
            if lang not in exceptdict:
                self.printLanguage(lang)

    def showLanguagesWithTranslators(self):
        """Report languages having a translators subpage."""
        self.printHeadLine(u'Languages with {T:translators subpages}')
        self.printSortedLanguageList(self.portalsWithTranslators)

    def showLanguagesWithNonZeroTranslators(self):
        """Report languages with at least one counted translator."""
        self.printHeadLine(u'Languages with nonzero translators')
        self.printSortedLanguageList(self.portalsWithNonZeroTranslators)

    def showLanguagesWithZeroTranslators(self):
        """Report languages whose translators subpage lists nobody."""
        self.printHeadLine(
            u'Languages with {T:translators subpages} and zero translators')
        self.printSortedLanguageListDifference(
            self.portalsWithTranslators, self.portalsWithNonZeroTranslators)

    def showListedLanguages(self):
        """Report languages linked from the listing page."""
        self.printHeadLine(u'{L:Listed Languages}')
        self.printSortedLanguageList(self.pagePortalLinks)

    def showListedLanguagesWithoutTranslators(self):
        """Report listed languages lacking a translators subpage."""
        self.printHeadLine(
            u'{L:Listed languages} without {T:translators subpages}')
        self.printSortedLanguageListDifference(
            self.pagePortalLinks, self.portalsWithTranslators)

    def showListedLanguagesWithZeroTranslators(self):
        """Report listed languages without any counted translator."""
        self.printHeadLine(u'{L:Listed languages} with zero translators')
        self.printSortedLanguageListDifference(
            self.pagePortalLinks, self.portalsWithNonZeroTranslators)

    def showLanguagesNotListedDespiteTranslators(self):
        """Report languages with translators that are not listed."""
        self.printHeadLine(u'Languages not {L:listed} despite translators')
        self.printSortedLanguageListDifference(
            self.portalsWithNonZeroTranslators, self.pagePortalLinks)

    def showLanguagesdWithRedirectedPortalPage(self):
        """Report languages recorded as having a redirected portal page."""
        self.printHeadLine(u'Languages with {P:Portal page} being a redirect')
        self.printSortedLanguageList(self.portalsWithRedirectedPortalPage)

    def showLanguagesdWithRedirectedTranslatorsPage(self):
        """Report languages whose translators subpage is a redirect."""
        self.printHeadLine(
            u'Languages with {T:translators subpage} being a redirect')
        self.printSortedLanguageList(
            self.portalsWithRedirectedTranslatorsPage)

    def showSupportedLanguages(self):
        """Report languages linked from the supported languages page."""
        self.printHeadLine(u'{S:Supported Languages}')
        self.printSortedLanguageList(self.SupportedLanguages)

    def showListedLanguagesNotSupported(self):
        """Report languages that are listed but not supported."""
        self.printHeadLine(u'Languages {L:listed} but not {S:supported}')
        self.printSortedLanguageListDifference(
            self.pagePortalLinks, self.SupportedLanguages)

    def showSupportedLanguagesNotListed(self):
        """Report languages that are supported but not listed."""
        self.printHeadLine(u'Languages {S:supported} but not {L:listed}')
        self.printSortedLanguageListDifference(
            self.SupportedLanguages, self.pagePortalLinks)

    def run(self):
        """Collect all data and print every report."""
        self.getSupportedLanguages(self.SupportedLanguagesPageTitle)
        self.showSupportedLanguages()
        self.getListedLanguages(self.ListedLanguagesPageTitle)
        self.showListedLanguages()
        self.getPortalsWithNonZeroTranslators()
        self.showLanguagesWithTranslators()
        self.showLanguagesWithNonZeroTranslators()
        self.showLanguagesWithZeroTranslators()
        # The redirected-portal-page report is skipped: nothing collects
        # that data yet.
        self.showLanguagesdWithRedirectedTranslatorsPage()
        self.showListedLanguagesWithoutTranslators()
        self.showListedLanguagesWithZeroTranslators()
        self.showLanguagesNotListedDespiteTranslators()
        self.showListedLanguagesNotSupported()
        self.showSupportedLanguagesNotListed()
# Module-level URL opener built with pywikibot's U2RedirectHandler; it is
# used by EthnologueLanguageTree.getPage() to open its requests.
# cookieProcessor = urllib2.HTTPCookieProcessor(cj)
MyURLopener = urllib2.build_opener(pywikibot.U2RedirectHandler)
class EthnologueLanguageTree:
    """Build and print a classification tree of languages from Ethnologue.

    For every language code a page is fetched from ``baseURL``; the language
    name and its comma separated classification are extracted with regular
    expressions. ``self.list`` maps codes to these entries, ``self.tree`` is
    a nested dict keyed by classification level whose ``''`` key holds the
    ``{name: code}`` leaves of that branch.
    """

    baseURL = 'http://www.ethnologue.com/show_language.asp?code='
    verbose = True
    # Class-level defaults kept for backward compatibility; __init__ gives
    # every instance its own dicts.
    codeMap = {}
    list = {}
    tree = {}

    def __init__(self, *args):
        """Compile the page patterns and reset the collected data.

        Positional arguments are accepted but not used.
        """
        self.nameR = re.compile(r'<h1>\s*([^<]+?)\s*<\/h1>')
        self.treeR = re.compile(
            r'(?s)>Classification<\/a>.*?<a\s[^>]+>\s*([^<]+?)\s*?<\/a>')
        self.spltR = re.compile(r'\s*,\s*')
        self.naecR = re.compile(
            r'Not an Ethnologue[ 0-9]*language code|Invalid language code|New language identifier code')
        # Instance-owned state, so that instances do not share results.
        self.codeMap = dict(self.codeMap)
        self.list = {}
        self.tree = {}

    def pageUrl(self, code):
        """Return the URL of the page for language *code*."""
        return '%s%s' % (self.baseURL, code)

    def getPage(self, url, retry=True, data='', compress=True,
                no_hostname=True, back_response=False, verbose=verbose):
        """Fetch *url* and return its content as a unicode string.

        @param url: address to fetch through ``MyURLopener``
        @param retry: if true, retry up to ``config.maxretries`` times after
            a 504 response or a non-HTTP failure; if false, do not retry
        @param data: unused, kept for signature compatibility
        @param compress: decompress the body if it is gzip encoded
        @param no_hostname: unused, kept for signature compatibility
        @param back_response: if true, return ``(response, text)``
        @param verbose: warn when the response declares no character set
        @raise pywikibot.PageNotFound: on HTTP 401, 403 or 404
        @raise pywikibot.MaxTriesExceededError: when no retries are left
        @raise urllib2.HTTPError: on any other HTTP error status
        """
        if retry:
            retry_attempt = config.maxretries
        else:
            # retry=False really means "no retries".
            retry_attempt = 0
        retry_idle_time = 1
        while True:
            try:
                request = urllib2.Request(url)
                f = MyURLopener.open(request)
                # read & info can raise socket.error
                text = f.read()
                headers = f.info()
                break
            except KeyboardInterrupt:
                raise
            except urllib2.HTTPError as e:
                if e.code in (401, 404):
                    raise pywikibot.PageNotFound(
                        u'Page %s could not be retrieved.' % url)
                if e.code == 403:
                    raise pywikibot.PageNotFound(
                        u'Page %s could not be retrieved. Check your virus wall.' % url)
                if e.code != 504:
                    pywikibot.output(u"Result: %s %s" % (e.code, e.msg))
                    raise
                pywikibot.output(u'HTTPError: %s %s' % (e.code, e.msg))
            except Exception as e:
                pywikibot.output(u'%s' % e)
            # Only retryable failures (504 or a non-HTTP error) get here.
            if retry_attempt <= 0:
                raise pywikibot.MaxTriesExceededError()
            retry_attempt -= 1
            pywikibot.output(
                u"WARNING: Could not open '%s'.\n Maybe the server or your connection is down. Retrying in %i minutes..."
                % (url, retry_idle_time))
            time.sleep(retry_idle_time * 60)
            # Next time wait longer, but not longer than half an hour.
            retry_idle_time = min(retry_idle_time * 2, 30)

        contentType = headers.get('content-type', '')
        contentEncoding = headers.get('content-encoding', '')
        # Ensure that all sent data is received; fetch again otherwise.
        if ('content-length' in headers
                and int(headers.get('content-length', '0')) != len(text)):
            pywikibot.output(
                u'Warning! len(text) does not match content-length: %s != %s'
                % (len(text), headers.get('content-length')))
            return self.getPage(url, retry, data, compress, no_hostname,
                                back_response, verbose)
        if compress and contentEncoding == 'gzip':
            text = pywikibot.decompress_gzip(text)
        m = re.search('charset=([^\'\";]+)', contentType)
        if m:
            charset = m.group(1)
        else:
            if verbose:
                pywikibot.output(u"WARNING: No character set found.")
            # UTF-8 as default
            charset = 'utf-8'
        # Convert HTML to Unicode
        try:
            text = text.decode(charset, 'strict')
        except UnicodeDecodeError as e:
            print(e)
            pywikibot.output(
                u'ERROR: Invalid characters found on %s, replaced by \\ufffd.' % url)
            # Fall back to replacing undecodable bytes.
            text = text.decode(charset, 'replace')
        if back_response:
            return f, text
        return text

    def getEntry(self, code):
        """Fetch the page for *code* and return its entry dict.

        The result has the keys ``'code'``, ``'name'`` and ``'tree'``. The
        name is ``''`` when the page has no ``<h1>`` heading, the tree is
        ``'Other'`` when no classification link is found. Names that look
        like an error notice get the code prepended.
        """
        text = self.getPage(self.pageUrl(code))
        found = self.nameR.search(text)
        # Without this fallback a missing heading would pass None to
        # naecR.match() below and raise TypeError.
        name = found.group(1) if found else ''
        found = self.treeR.search(text)
        tree = found.group(1) if found else 'Other'
        if self.naecR.match(name):
            name = '%s - %s' % (code, name)
        if self.verbose:
            pywikibot.output(' == %s == %s == %s ==' % (code, name, tree))
        return {
            'code': code,
            'name': name,
            'tree': tree,
        }

    def mapCode(self, code):
        """Return the replacement for *code* from ``codeMap``, else *code*."""
        return self.codeMap.get(code, code)

    def getList(self, codelist):
        """Fetch the entry of every code in *codelist* into ``self.list``."""
        for code in codelist:
            self.list[code] = self.getEntry(code)

    def list2tree(self):
        """Insert every entry of ``self.list`` into ``self.tree``."""
        for code, entry in self.list.items():
            # Empty items would collide with the '' leaf marker; an entirely
            # empty classification is filed under 'Other'.
            path = [item for item in self.spltR.split(entry['tree']) if item]
            node = self.tree
            for item in path or ['Other']:
                node = node.setdefault(item, {})
            leaves = node.setdefault('', {})
            name = entry['name']
            if name in leaves:
                pywikibot.output(
                    'ERROR: %s - Duplicate language name: %s ' % (code, name))
            leaves[name] = code

    def decommafy(self, name):
        """Reverse the comma separated parts of *name*, joined by spaces."""
        return ' '.join(reversed(self.spltR.split(name)))

    def levelIndent(self, level=0):
        """Return *level* asterisks (wikitext list indentation)."""
        return '*' * max(level, 0)

    def printLine(self, line):
        """Print one line of output."""
        print(line)

    def printHead(self, line):
        """Print *line* as a level 2 heading preceded by a blank line."""
        self.printLine('\n== %s ==' % line)

    def printLanguage(self, level, code, anchor=''):
        """Print a bold portal link for *code* at list depth *level*."""
        self.printLine(u"%s'''[[Portal:%s|%s]]'''"
                       % (self.levelIndent(level), code, anchor))

    def printSortedDict(self, branch, level=0):
        """Print *branch* recursively in sorted key order.

        A negative *level* marks *branch* as a ``{name: code}`` leaf dict to
        be printed at depth ``-level``. Otherwise named keys are sub-branches
        (a heading at level 0, a list item below that) and the ``''`` key
        holds the leaves of this branch.
        """
        for name in sorted(branch):
            if level < 0:
                self.printLanguage(-level, branch[name], self.decommafy(name))
            elif not name:
                self.printSortedDict(branch[name], -level)
            else:
                if level:
                    self.printLine('%s%s' % (self.levelIndent(level), name))
                else:
                    self.printHead(name)
                self.printSortedDict(branch[name], level + 1)

    def run(self):
        """Fetch a fixed set of language codes and print their tree."""
        codes = [
            'abs', 'abk', 'ksk', 'deu', 'nov', 'eur', 'mul', 'dum', 'cdi', 'cds',
            'cda', 'ksh', 'sxu', 'lim', 'zea', 'gro', 'wep', 'nds', 'drh', 'dre',
            'nld', 'afr', 'lfn', 'drt', 'act', 'dse', 'gos', 'sdz', 'twd', 'stl',
            'vla', 'zea', 'vel', 'rmy', 'rmo', 'fry', 'nds', 'pdt', 'vls', 'dlc',
        ]
        self.getList(codes)
        self.list2tree()
        self.printSortedDict(self.tree)
def main(*args):
    """Run one of the two report generators.

    By default the translator portal reports are produced. When the string
    ``'-ethnologue'`` is among *args*, the Ethnologue language tree is
    printed instead (this branch used to be unreachable behind an early
    ``return``). All arguments are handed on to the chosen class.
    """
    if '-ethnologue' in args:
        app = EthnologueLanguageTree(*args)
    else:
        app = TranslatorPortalCollector(*args)
    app.run()
if __name__ == "__main__":
    try:
        main()
    finally:
        # Runs even when main() raises, so pywikibot.stopme() is always called.
        pywikibot.stopme()