GettextFFS.php

Go to the documentation of this file.
00001 <?php
/**
 * Signals that a message expected to contain {{PLURAL:GETTEXT|...}} syntax
 * could not be split into its plural forms.
 *
 * Thrown by GettextFFS::splitPlural() and caught in
 * GettextFFS::formatMessageBlock(), which then flags the message as
 * "invalid-plural" and writes empty plural forms.
 */
class GettextPluralException extends MWException {
}
00017 
/**
 * File format support for Gettext catalogues (.po / .pot files).
 *
 * Reading turns the textual catalogue into an array with the keys MESSAGES,
 * TEMPLATE, METADATA and HEADERS (plus AUTHORS when read through
 * readFromVariable()). Writing turns a MessageCollection back into Gettext
 * syntax. Gettext plural messages are represented on the wiki side as
 * {{PLURAL:GETTEXT|form0|form1|...}}.
 */
class GettextFFS extends SimpleFFS {
    /**
     * @return string Always 'yes'.
     */
    public function supportsFuzzy() {
        return 'yes';
    }

    /**
     * @return string[] File extensions handled by this class.
     */
    public function getFileExtensions() {
        return array( '.pot', '.po' );
    }

    /**
     * When true, export writes the message key as msgctxt, adds the
     * X-Language-Code / X-Message-Group headers and includes message
     * documentation as "#. [Wiki]" comments.
     */
    protected $offlineMode = false;

    /**
     * Switches offline mode on or off for subsequent exports.
     *
     * @param bool $value
     */
    public function setOfflineMode( $value ) {
        $this->offlineMode = $value;
    }

    /**
     * Parses a complete Gettext file given as a string.
     *
     * Collects "# Author: ..." comment lines into AUTHORS, parses the rest
     * with parseGettext() and drops messages whose value is the empty string.
     *
     * @param string $data File contents.
     * @return array Parsed data with keys MESSAGES, TEMPLATE, METADATA,
     *   HEADERS and AUTHORS.
     */
    public function readFromVariable( $data ) {
        # Authors first
        $matches = array();
        preg_match_all( '/^#\s*Author:\s*(.*)$/m', $data, $matches );
        $authors = $matches[1];

        # Then messages and everything else
        $parsedData = $this->parseGettext( $data );
        $parsedData['AUTHORS'] = $authors;

        // Untranslated entries are not messages as far as callers are concerned
        foreach ( $parsedData['MESSAGES'] as $key => $value ) {
            if ( $value === '' ) {
                unset( $parsedData['MESSAGES'][$key] );
            }
        }

        return $parsedData;
    }

    /**
     * Parses Gettext data using this instance's group mangler and the
     * 'CtxtAsKey' / 'keyAlgorithm' settings found in $this->extra.
     *
     * @param string $data File contents.
     * @return array See parseGettextData().
     */
    public function parseGettext( $data ) {
        $mangler = $this->group->getMangler();
        $useCtxtAsKey = isset( $this->extra['CtxtAsKey'] ) && $this->extra['CtxtAsKey'];
        $keyAlgorithm = 'legacy';
        if ( isset( $this->extra['keyAlgorithm'] ) ) {
            $keyAlgorithm = $this->extra['keyAlgorithm'];
        }

        return self::parseGettextData( $data, $useCtxtAsKey, $mangler, $keyAlgorithm );
    }

    /**
     * Parses a Gettext file into messages, per-message templates, metadata
     * and header tags.
     *
     * @param string $data File contents.
     * @param bool $useCtxtAsKey Use msgctxt as the message key; sections
     *   without msgctxt are then logged and skipped.
     * @param object $mangler Object whose mangle() method is applied to
     *   every key.
     * @param string $keyAlgorithm Passed to generateKeyFromItem() when the
     *   key is not taken from msgctxt.
     * @return array With keys MESSAGES, TEMPLATE, METADATA and HEADERS.
     * @throws MWException If the header section has no msgstr, or a section
     *   cannot be parsed.
     */
    public static function parseGettextData( $data, $useCtxtAsKey, $mangler, $keyAlgorithm ) {
        $potmode = false;

        // Normalise newlines, to make processing easier
        $data = str_replace( "\r\n", "\n", $data );

        /* Delimit the file into sections, which are separated by two newlines.
         * We are permissive and accept more than two. This parsing method isn't
         * efficient wrt memory, but was easy to implement */
        $sections = preg_split( '/\n{2,}/', $data );

        /* First one isn't an actual message. We'll handle it specially below */
        $headerSection = array_shift( $sections );
        /* Since this is the header section, we are only interested in the tags
         * and msgid is empty. Somewhere we should extract the header comments
         * too */
        $match = self::expectKeyword( 'msgstr', $headerSection );
        if ( $match !== null ) {
            $headerBlock = self::formatForWiki( $match, 'trim' );
            $headers = self::parseHeaderTags( $headerBlock );

            // Check for pot-mode by checking if the header is fuzzy
            $flags = self::parseFlags( $headerSection );
            if ( in_array( 'fuzzy', $flags, true ) ) {
                $potmode = true;
            }
        } else {
            throw new MWException( "Gettext file header was not found:\n\n$data" );
        }

        $template = array();
        $messages = array();

        // Extract some metadata from headers for easier use
        $metadata = array();
        if ( isset( $headers['X-Language-Code'] ) ) {
            $metadata['code'] = $headers['X-Language-Code'];
        }

        if ( isset( $headers['X-Message-Group'] ) ) {
            $metadata['group'] = $headers['X-Message-Group'];
        }

        /* At this stage we are only interested how many plurals forms we should
         * be expecting when parsing the rest of this file. */
        $pluralCount = false;
        if ( isset( $headers['Plural-Forms'] ) ) {
            $matches = array();
            if ( preg_match( '/nplurals=([0-9]+).*;/', $headers['Plural-Forms'], $matches ) ) {
                $pluralCount = $metadata['plural'] = $matches[1];
            }
        }

        // Then parse the messages
        foreach ( $sections as $section ) {

            $item = self::parseGettextSection( $section, $pluralCount, $metadata );
            if ( $item === false ) {
                continue;
            }

            if ( $useCtxtAsKey ) {
                if ( !isset( $item['ctxt'] ) ) {
                    error_log( "ctxt missing for: $section" );
                    continue;
                }
                $key = $item['ctxt'];
            } else {
                $key = self::generateKeyFromItem( $item, $keyAlgorithm );
            }

            $key = $mangler->mangle( $key );
            // In pot mode (fuzzy header) the source text itself is the message
            $messages[$key] = $potmode ? $item['id'] : $item['str'];
            $template[$key] = $item;
        }

        return array(
            'MESSAGES' => $messages,
            'TEMPLATE' => $template,
            'METADATA' => $metadata,
            'HEADERS' => $headers
        );
    }

    /**
     * Parses one message section (the text between blank lines).
     *
     * @param string $section Section text.
     * @param int|string|bool $pluralCount Number of plural forms to expect,
     *   or false if unknown.
     * @param array &$metadata Not used by this method.
     * @return array|bool False for blank or obsolete (#~) sections, otherwise
     *   an array with keys ctxt, id, str, flags and comments.
     * @throws MWException If msgid, or msgstr of a non-plural message, is
     *   missing.
     */
    public static function parseGettextSection( $section, $pluralCount, &$metadata ) {

        if ( trim( $section ) === '' ) {
            return false;
        }

        /* These inactive sections are of no interest to us. Multiline mode
         * is needed because there may be flags or other annoying stuff
         * before the commented out sections.
         */
        if ( preg_match( '/^#~/m', $section ) ) {
            return false;
        }

        $item = array(
            'ctxt' => false,
            'id' => '',
            'str' => '',
            'flags' => array(),
            'comments' => array(),
        );

        $match = self::expectKeyword( 'msgid', $section );
        if ( $match !== null ) {
            $item['id'] = self::formatForWiki( $match );
        } else {
            throw new MWException( "Unable to parse msgid:\n\n$section" );
        }

        $match = self::expectKeyword( 'msgctxt', $section );
        if ( $match !== null ) {
            $item['ctxt'] = self::formatForWiki( $match );
        }

        $pluralMessage = false;
        $match = self::expectKeyword( 'msgid_plural', $section );
        if ( $match !== null ) {
            $pluralMessage = true;
            $plural = self::formatForWiki( $match );
            // Singular and plural source strings are merged into one definition
            $item['id'] = "{{PLURAL:GETTEXT|{$item['id']}|$plural}}";
        }

        if ( $pluralMessage ) {
            $pluralMessageText = self::processGettextPluralMessage( $pluralCount, $section );

            // Keep the translation empty if no form has translation
            if ( $pluralMessageText !== '' ) {
                $item['str'] = $pluralMessageText;
            }
        } else {
            $match = self::expectKeyword( 'msgstr', $section );
            if ( $match !== null ) {
                $item['str'] = self::formatForWiki( $match );
            } else {
                throw new MWException( "Unable to parse msgstr:\n\n$section" );
            }
        }

        // Parse flags; "fuzzy" is carried in the translation text instead
        $flags = self::parseFlags( $section );
        foreach ( $flags as $key => $flag ) {
            if ( $flag === 'fuzzy' ) {
                $item['str'] = TRANSLATE_FUZZY . $item['str'];
                unset( $flags[$key] );
            }
        }
        $item['flags'] = $flags;

        // Rest of the comments, grouped by the character following "#"
        $matches = array();
        if ( preg_match_all( '/^#(.?) (.*)$/m', $section, $matches, PREG_SET_ORDER ) ) {
            foreach ( $matches as $match ) {
                // Skip the flag line and the "[Wiki]" documentation comments
                // that formatDocumentation() generates on export. The marker
                // is in the comment text ($match[2]); $match[1] is only the
                // single type character.
                if ( $match[1] !== ',' && strpos( $match[2], '[Wiki]' ) !== 0 ) {
                    $item['comments'][$match[1]][] = $match[2];
                }
            }
        }

        return $item;
    }

    /**
     * Collects msgstr[0] .. msgstr[$pluralCount - 1] from a section into a
     * single {{PLURAL:GETTEXT|...}} string.
     *
     * Missing forms are logged and treated as empty strings.
     *
     * @param int|string|bool $pluralCount Number of forms to look for.
     * @param string $section Section text.
     * @return string The combined plural text, or '' if every form is empty.
     */
    public static function processGettextPluralMessage( $pluralCount, $section ) {
        $actualForms = array();

        for ( $i = 0; $i < $pluralCount; $i++ ) {
            $match = self::expectKeyword( "msgstr\\[$i\\]", $section );

            if ( $match !== null ) {
                $actualForms[] = self::formatForWiki( $match );
            } else {
                $actualForms[] = '';
                error_log( "Plural $i not found, expecting total of $pluralCount for $section" );
            }
        }

        if ( array_sum( array_map( 'strlen', $actualForms ) ) > 0 ) {
            return '{{PLURAL:GETTEXT|' . implode( '|', $actualForms ) . '}}';
        } else {
            return '';
        }
    }

    /**
     * Extracts the flags from the first "#," line of a section.
     *
     * @param string $section Section text.
     * @return string[] Trimmed flags; empty if the section has no flag line.
     */
    public static function parseFlags( $section ) {
        $matches = array();
        if ( preg_match( '/^#,(.*)$/mu', $section, $matches ) ) {
            return array_map( 'trim', explode( ',', $matches[1] ) );
        } else {
            return array();
        }
    }

    /**
     * Finds a keyword at the start of a line and returns the quoted text
     * block following it, including continuation lines.
     *
     * @param string $name Keyword, inserted into a regular expression as-is
     *   (callers escape brackets themselves).
     * @param string $section Section text.
     * @return string|null The raw, still quoted text block, or null if the
     *   keyword was not found.
     */
    public static function expectKeyword( $name, $section ) {
        /* Catches the multiline textblock that comes after keywords msgid,
         * msgstr, msgid_plural, msgctxt.
         */
        $poformat = '".*"\n?(^".*"$\n?)*';

        $matches = array();
        if ( preg_match( "/^$name\s($poformat)/mx", $section, $matches ) ) {
            return $matches[1];
        } else {
            return null;
        }
    }

    /**
     * Builds a message key of the form "<hash>-<snippet>" from a parsed item.
     *
     * The hash is the SHA-1 of msgctxt followed by msgid. The snippet is
     * derived from msgid, truncated with a length argument of 30 and with
     * spaces replaced by underscores.
     *
     * @param array $item Item with at least the keys 'ctxt' and 'id'.
     * @param string $algorithm 'simple' shortens the hash to six characters
     *   and uses msgid as is; anything else selects the legacy behaviour,
     *   which first replaces characters outside $wgLegalTitleChars and some
     *   punctuation with spaces.
     * @return string
     */
    public static function generateKeyFromItem( array $item, $algorithm = 'legacy' ) {
        $lang = Language::factory( 'en' );

        if ( $item['ctxt'] === '' ) {
            /* Messages with msgctxt as empty string should be different
             * from messages without any msgctxt. To avoid BC break make
             * the empty ctxt a special case */
            $hash = sha1( $item['id'] . 'MSGEMPTYCTXT' );
        } else {
            $hash = sha1( $item['ctxt'] . $item['id'] );
        }

        if ( $algorithm === 'simple' ) {
            $hash = substr( $hash, 0, 6 );
            $snippet = $lang->truncate( $item['id'], 30, '' );
            $snippet = str_replace( ' ', '_', trim( $snippet ) );
        } else { // legacy
            global $wgLegalTitleChars;
            $snippet = $item['id'];
            $snippet = preg_replace( "/[^$wgLegalTitleChars]/", ' ', $snippet );
            $snippet = preg_replace( "/[:&%\/_]/", ' ', $snippet );
            $snippet = preg_replace( "/ {2,}/", ' ', $snippet );
            $snippet = $lang->truncate( $snippet, 30, '' );
            $snippet = str_replace( ' ', '_', trim( $snippet ) );
        }

        return "$hash-$snippet";
    }

    /**
     * Converts a quoted Gettext text block into plain text.
     *
     * Removes the quote at the start of each line and the quote plus newline
     * at the end of each line, then resolves C-style escapes.
     *
     * @param string $data Raw text block as returned by expectKeyword().
     * @param string $whitespace What to do with trailing whitespace: 'mark'
     *   appends a backslash so it survives, 'trim' removes it.
     * @return string
     * @throws MWException If $whitespace has another value and the text ends
     *   in whitespace.
     */
    public static function formatForWiki( $data, $whitespace = 'mark' ) {
        $quotePattern = '/(^"|"$\n?)/m';
        $data = preg_replace( $quotePattern, '', $data );
        $data = stripcslashes( $data );

        if ( preg_match( '/\s$/', $data ) ) {
            if ( $whitespace === 'mark' ) {
                $data .= '\\';
            } elseif ( $whitespace === 'trim' ) {
                $data = rtrim( $data );
            } else {
                // @todo Only triggered if there is trailing whitespace
                throw new MWException( 'Unknown action for whitespace' );
            }
        }

        return $data;
    }

    /**
     * Splits a header block into "Name: value" tags.
     *
     * Lines without a colon are logged and skipped.
     *
     * @param string $headers Header text, one tag per line.
     * @return array Map of trimmed tag name to trimmed value.
     */
    public static function parseHeaderTags( $headers ) {
        $tags = array();
        foreach ( explode( "\n", $headers ) as $line ) {
            if ( strpos( $line, ':' ) === false ) {
                error_log( __METHOD__ . ": $line" );
                // Nothing to split: destructuring below would raise a notice
                // and register the whole line as a tag with an empty value.
                continue;
            }
            list( $key, $value ) = explode( ':', $line, 2 );
            $tags[trim( $key )] = trim( $value );
        }

        return $tags;
    }

    /**
     * Produces the Gettext file contents for a message collection.
     *
     * Per-message comments, flags and msgctxt are taken from the existing
     * 'en' file and the existing file of the collection's language.
     *
     * @param MessageCollection $collection
     * @return string
     */
    protected function writeReal( MessageCollection $collection ) {
        $pot = $this->read( 'en' );
        $template = $this->read( $collection->code );
        // Filled in by doGettextHeader() from the Plural-Forms header
        $pluralCount = false;
        $output = $this->doGettextHeader( $collection, $template, $pluralCount );

        foreach ( $collection as $key => $m ) {
            $transTemplate = isset( $template['TEMPLATE'][$key] ) ?
                $template['TEMPLATE'][$key] : array();
            $potTemplate = isset( $pot['TEMPLATE'][$key] ) ?
                $pot['TEMPLATE'][$key] : array();

            $output .= $this->formatMessageBlock( $key, $m, $transTemplate, $potTemplate, $pluralCount );
        }

        return $output;
    }

    /**
     * Builds the leading comment block and the header entry (empty msgid)
     * of the exported file.
     *
     * Header tags from the existing file are kept and then overridden with
     * current values.
     *
     * @param MessageCollection $collection
     * @param array|bool $template Previously parsed file of the target
     *   language; its HEADERS key is used when set.
     * @param int|string|bool &$pluralCount Receives the nplurals value of the
     *   final Plural-Forms header, or false if it cannot be determined.
     * @return string
     */
    protected function doGettextHeader( MessageCollection $collection, $template, &$pluralCount ) {
        global $wgSitename;

        $code = $collection->code;
        $name = TranslateUtils::getLanguageName( $code );
        $native = TranslateUtils::getLanguageName( $code, $code );
        $authors = $this->doAuthors( $collection );
        if ( isset( $this->extra['header'] ) ) {
            $extra = "# --\n" . $this->extra['header'];
        } else {
            $extra = '';
        }

        $output = <<<PHP
# Translation of {$this->group->getLabel()} to $name ($native)
# Exported from $wgSitename
#
$authors$extra
PHP;

        // Make sure there is no empty line before msgid
        $output = trim( $output ) . "\n";

        $specs = isset( $template['HEADERS'] ) ? $template['HEADERS'] : array();

        $timestamp = wfTimestampNow();
        $specs['PO-Revision-Date'] = self::formatTime( $timestamp );
        if ( $this->offlineMode ) {
            $specs['POT-Creation-Date'] = self::formatTime( $timestamp );
        } elseif ( $this->group instanceof MessageGroupBase ) {
            $specs['X-POT-Import-Date'] = self::formatTime( wfTimestamp( TS_MW, $this->getPotTime() ) );
        }
        $specs['Content-Type'] = 'text/plain; charset=UTF-8';
        $specs['Content-Transfer-Encoding'] = '8bit';
        $specs['Language'] = wfBCP47( $this->group->mapCode( $code ) );
        wfRunHooks( 'Translate:GettextFFS:headerFields', array( &$specs, $this->group, $code ) );
        $specs['X-Generator'] = $this->getGenerator();

        if ( $this->offlineMode ) {
            $specs['X-Language-Code'] = $code;
            $specs['X-Message-Group'] = $this->group->getId();
        }

        // Prefer the bundled rule; otherwise keep the existing header or
        // fall back to the two-form default.
        $plural = self::getPluralRule( $code );
        if ( $plural ) {
            $specs['Plural-Forms'] = $plural;
        } elseif ( !isset( $specs['Plural-Forms'] ) ) {
            $specs['Plural-Forms'] = 'nplurals=2; plural=(n != 1);';
        }

        $match = array();
        if ( preg_match( '/nplurals=(\d+);/', $specs['Plural-Forms'], $match ) ) {
            $pluralCount = $match[1];
        } else {
            // A Plural-Forms value kept from an existing file may not have
            // the expected shape; do not read a capture that is not there.
            error_log( __METHOD__ . ": unable to parse plural count from: {$specs['Plural-Forms']}" );
            $pluralCount = false;
        }

        $output .= 'msgid ""' . "\n";
        $output .= 'msgstr ""' . "\n";
        $output .= '""' . "\n";

        foreach ( $specs as $k => $v ) {
            $output .= self::escape( "$k: $v\n" ) . "\n";
        }

        $output .= "\n";

        return $output;
    }

    /**
     * Formats the collection's authors, after filterAuthors(), as
     * "# Author: ..." comment lines.
     *
     * @param MessageCollection $collection
     * @return string
     */
    protected function doAuthors( MessageCollection $collection ) {
        $output = '';
        $authors = $collection->getAuthors();
        $authors = $this->filterAuthors( $authors, $collection->code );

        foreach ( $authors as $author ) {
            $output .= "# Author: $author\n";
        }

        return $output;
    }

    /**
     * Formats one message as a Gettext entry followed by a blank line.
     *
     * @param string $key Message key.
     * @param object $m Message from the collection; getTags(), definition()
     *   and translation() are called on it.
     * @param array $trans Template item of this message from the existing
     *   translation file, or an empty array.
     * @param array $pot Template item of this message from the existing
     *   'en' file, or an empty array. Takes precedence over $trans.
     * @param int|string|bool $pluralCount Number of plural forms to write.
     * @return string
     */
    protected function formatMessageBlock( $key, $m, $trans, $pot, $pluralCount ) {
        $header = $this->formatDocumentation( $key );
        $content = '';

        $comments = self::chainGetter( 'comments', $pot, $trans, array() );
        foreach ( $comments as $type => $typecomments ) {
            foreach ( $typecomments as $comment ) {
                $header .= "#$type $comment\n";
            }
        }

        $flags = self::chainGetter( 'flags', $pot, $trans, array() );
        $flags = array_merge( $m->getTags(), $flags );

        if ( $this->offlineMode ) {
            $content .= 'msgctxt ' . self::escape( $key ) . "\n";
        } else {
            $ctxt = self::chainGetter( 'ctxt', $pot, $trans, false );
            if ( $ctxt !== false ) {
                $content .= 'msgctxt ' . self::escape( $ctxt ) . "\n";
            }
        }

        $msgid = $m->definition();
        $msgstr = $m->translation();
        if ( strpos( $msgstr, TRANSLATE_FUZZY ) !== false ) {
            $msgstr = str_replace( TRANSLATE_FUZZY, '', $msgstr );
            // Might by fuzzy infile
            $flags[] = 'fuzzy';
        }

        if ( preg_match( '/{{PLURAL:GETTEXT/i', $msgid ) ) {
            $forms = $this->splitPlural( $msgid, 2 );
            $content .= 'msgid ' . $this->escape( $forms[0] ) . "\n";
            $content .= 'msgid_plural ' . $this->escape( $forms[1] ) . "\n";

            try {
                $forms = $this->splitPlural( $msgstr, $pluralCount );
                foreach ( $forms as $index => $form ) {
                    $content .= "msgstr[$index] " . $this->escape( $form ) . "\n";
                }
            } catch ( GettextPluralException $e ) {
                // Translation lacks plural syntax: export empty forms and
                // make the problem visible through a flag.
                $flags[] = 'invalid-plural';
                for ( $i = 0; $i < $pluralCount; $i++ ) {
                    $content .= "msgstr[$i] \"\"\n";
                }
            }
        } else {
            $content .= 'msgid ' . self::escape( $msgid ) . "\n";
            $content .= 'msgstr ' . self::escape( $msgstr ) . "\n";
        }

        if ( $flags ) {
            sort( $flags );
            $header .= "#, " . implode( ', ', array_unique( $flags ) ) . "\n";
        }

        // An entry without any comments still gets a lone "#" line
        $output = $header ? $header : "#\n";
        $output .= $content . "\n";

        return $output;
    }

    /**
     * Returns $a[$key] if set, else $b[$key] if set, else $default.
     *
     * @param string $key
     * @param array $a Preferred source.
     * @param array $b Fallback source.
     * @param mixed $default
     * @return mixed
     */
    protected static function chainGetter( $key, $a, $b, $default ) {
        if ( isset( $a[$key] ) ) {
            return $a[$key];
        } elseif ( isset( $b[$key] ) ) {
            return $b[$key];
        } else {
            return $default;
        }
    }

    /**
     * Formats a timestamp for use in header tags, with a fixed "+0000"
     * offset suffix.
     *
     * @param string $time Timestamp as accepted by Language::sprintfDate().
     * @return string
     */
    protected static function formatTime( $time ) {
        $lang = Language::factory( 'en' );

        return $lang->sprintfDate( 'xnY-xnm-xnd xnH:xni:xns+0000', $time );
    }

    /**
     * @return string Timestamp of the group's message cache if it exists,
     *   otherwise the current time.
     */
    protected function getPotTime() {
        $defs = new MessageGroupCache( $this->group );

        return $defs->exists() ? $defs->getTimestamp() : wfTimestampNow();
    }

    /**
     * @return string Value for the X-Generator header tag.
     */
    protected function getGenerator() {
        return 'MediaWiki ' . SpecialVersion::getVersion() .
            "; Translate " . TRANSLATE_VERSION;
    }

    /**
     * Returns the message documentation as "#. [Wiki] ..." comment lines.
     *
     * Only produces output in offline mode, when
     * $wgTranslateDocumentationLanguageCode is set and the documentation
     * lookup returns a string.
     *
     * @param string $key Message key.
     * @return string Comment lines, or '' if there is nothing to add.
     */
    protected function formatDocumentation( $key ) {
        global $wgTranslateDocumentationLanguageCode;

        if ( !$this->offlineMode ) {
            return '';
        }

        $code = $wgTranslateDocumentationLanguageCode;
        if ( !$code ) {
            return '';
        }

        $documentation = TranslateUtils::getMessageContent( $key, $code, $this->group->getNamespace() );
        if ( !is_string( $documentation ) ) {
            return '';
        }

        $lines = explode( "\n", $documentation );
        $out = '';
        foreach ( $lines as $line ) {
            $out .= "#. [Wiki] $line\n";
        }

        return $out;
    }

    /**
     * Quotes a string for output in a Gettext file.
     *
     * Backslashes and double quotes are escaped, newlines become a literal
     * "\n" and the result is wrapped in double quotes.
     *
     * @param string $line
     * @return string
     */
    protected static function escape( $line ) {
        // There may be \ as a last character, for keeping trailing whitespace
        $line = preg_replace( '/(\s)\\\\$/', '\1', $line );
        $line = addcslashes( $line, '\\"' );
        $line = str_replace( "\n", '\n', $line );
        $line = '"' . $line . '"';

        return $line;
    }

    /**
     * Looks up the Plural-Forms value for a language in the bundled
     * data/plural-gettext.txt, which holds "code<TAB>rule" lines.
     *
     * @param string $code Language code.
     * @return string The rule, or '' if the file cannot be read or has no
     *   entry for the code.
     */
    public static function getPluralRule( $code ) {
        $rulefile = __DIR__ . '/../data/plural-gettext.txt';
        $rules = file_get_contents( $rulefile );
        if ( $rules === false ) {
            return '';
        }

        foreach ( explode( "\n", $rules ) as $line ) {
            if ( trim( $line ) === '' ) {
                continue;
            }

            $fields = explode( "\t", $line );
            if ( count( $fields ) < 2 ) {
                // Not a "code<TAB>rule" line; nothing to compare against
                continue;
            }

            list( $rulecode, $rule ) = $fields;
            if ( $rulecode === $code ) {
                return $rule;
            }
        }

        return '';
    }

    /**
     * Expands text containing {{PLURAL:GETTEXT|a|b|...}} into one string per
     * plural form, keeping the text around the plural syntax.
     *
     * For form index i, every plural instance in the text is replaced with
     * its i-th alternative. If an instance has too few alternatives, this is
     * logged and that form becomes the empty string.
     *
     * @param string $text Text with plural syntax.
     * @param int|string $forms Number of plural forms to produce.
     * @return string[]|string List of forms indexed from 0.
     * @throws GettextPluralException If the text has no plural syntax.
     */
    protected function splitPlural( $text, $forms ) {
        // NOTE(review): this returns a string while every other path returns
        // an array, and the callers in this class iterate over the result.
        // It only triggers for an integer 1; the count derived from the
        // Plural-Forms header is a string. Confirm the intended behaviour.
        if ( $forms === 1 ) {
            return $text;
        }

        $placeholder = TranslateUtils::getPlaceholder();
        # |/| is commonly used in KDE to support inflections
        $text = str_replace( '|/|', $placeholder, $text );

        $plurals = array();
        $match = preg_match_all( '/{{PLURAL:GETTEXT\|(.*)}}/iUs', $text, $plurals );
        if ( !$match ) {
            throw new GettextPluralException( "Failed to find plural in: $text" );
        }

        $splitPlurals = array();
        for ( $i = 0; $i < $forms; $i++ ) {
            # Start with the whole string
            $pluralForm = $text;
            # Loop over *each* {{PLURAL}} instance and replace
            # it with the plural form belonging to this index
            foreach ( $plurals[0] as $index => $definition ) {
                $parsedFormsArray = explode( '|', $plurals[1][$index] );
                if ( !isset( $parsedFormsArray[$i] ) ) {
                    error_log( "Too few plural forms in: $text" );
                    $pluralForm = '';
                } else {
                    // search = the plural instance, replacement = the chosen
                    // alternative, subject = the text built so far
                    $pluralForm = str_replace( $definition, $parsedFormsArray[$i], $pluralForm );
                }
            }

            $pluralForm = str_replace( $placeholder, '|/|', $pluralForm );
            $splitPlurals[$i] = $pluralForm;
        }

        return $splitPlurals;
    }
}
Generated on Tue Oct 29 00:00:23 2013 for MediaWiki Translate Extension by  doxygen 1.6.3