SolrTTMServer.php

Go to the documentation of this file.
00001 <?php
00017 class SolrTTMServer extends TTMServer implements ReadableTTMServer, WritableTTMServer {
00022     const COMMIT_WITHIN = 5000;
00023 
00024     protected $client;
00025     protected $updates;
00026     protected $revIds;
00027 
00028     public function __construct( $config ) {
00029         wfProfileIn( __METHOD__ );
00030         parent::__construct( $config );
00031         if ( isset( $config['config'] ) ) {
00032             $this->client = new Solarium_Client( $config['config'] );
00033         } else {
00034             $this->client = new Solarium_Client();
00035         }
00036         wfProfileOut( __METHOD__ );
00037     }
00038 
00039     public function isLocalSuggestion( array $suggestion ) {
00040         return $suggestion['wiki'] === wfWikiId();
00041     }
00042 
00043     public function expandLocation( array $suggestion ) {
00044         return $suggestion['uri'];
00045     }
00046 
00047     public function query( $sourceLanguage, $targetLanguage, $text ) {
00048         try {
00049             return $this->doQuery( $sourceLanguage, $targetLanguage, $text );
00050         } catch ( Solarium_Exception $e ) {
00051             throw new TranslationHelperException( 'Solarium exception: ' . $e );
00052         }
00053     }
00054 
00056     protected function doQuery( $sourceLanguage, $targetLanguage, $text ) {
00057         /* Two query system:
00058          * 1) Find all strings in source language that match text
00059          * 2) Do another query for translations for those strings
00060          */
00061         wfProfileIn( __METHOD__ );
00062         // For now impose a length limit on query string to avoid doing
00063         // very slow queries. Magic number.
00064         if ( strlen( $text ) > 789 ) {
00065             return array();
00066         }
00067 
00068         $query = $this->client->createSelect();
00069         $query->setFields( array( 'globalid', 'content', 'score' ) );
00070 
00071         /* The interface usually displays three best candidates. These might
00072          * come from more than three matches, if the translation is the same.
00073          * This might not find all suggestions, if the top N best matching
00074          * source texts don't have translations, but worse matches do. We
00075          * could loop with start parameter to fetch more until we have enough
00076          * suggestions or the quality drops below the cutoff point. */
00077         $query->setRows( 25 );
00078 
00079         /* Our string can contain all kind of nasty characters, so we need
00080          * escape them with great pain. */
00081         $helper = $query->getHelper();
00082         $dist = $helper->escapePhrase( $text );
00083         // "edit" could also be ngram of other algorithm
00084         $dist = "strdist($dist,content,edit)";
00085         /* Note how we need to escape twice here, first the string for strdist
00086          * and then the strdist call itself for the query. And of course every-
00087          * thing will be URL encoded once sent over the line. */
00088         $query->setQuery( '_val_:%P1%', array( $dist ) );
00089 
00090         /* Filter queries are supposed to be efficient as they are separately
00091          * cached, but I haven't done any benchmarks. */
00092         $query->createFilterQuery( 'lang' )
00093             ->setQuery( 'language:%P1%', array( $sourceLanguage ) );
00094 
00095         $resultset = $this->client->select( $query );
00096 
00097         /* This query is doing two unrelated things:
00098          * 1) Collect the message contents and scores so that they can
00099          *    be accessed later for the translations we found.
00100          * 2) Build the query string for the query that fetches the
00101          *    translations.
00102          * This code is a bit uglier than I'd like it to be, since there
00103          * there is no field that globally identifies a message (message
00104          * definition and translations). */
00105         $contents = $scores = array();
00106         $queryString = '';
00107         foreach ( $resultset as $doc ) {
00108             $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );
00109             $contents[$sourceId] = $doc->content;
00110             $scores[$sourceId] = $doc->score;
00111 
00112             $globalid = $helper->escapePhrase( "$sourceId/$targetLanguage" );
00113             $queryString .= "globalid:$globalid ";
00114         }
00115 
00116         // Second query to fetch available translations
00117         $fetchQuery = $this->client->createSelect();
00118         $fetchQuery->setFields( array( 'wiki', 'uri', 'content', 'messageid', 'globalid' ) );
00119         // This come in random order, so have to fetch all and sort
00120         $fetchQuery->setRows( 25 );
00121         $fetchQuery->setQuery( $queryString );
00122         // With AND we would not find anything, obviously.
00123         $fetchQuery->setQueryDefaultOperator( Solarium_Query_Select::QUERY_OPERATOR_OR );
00124 
00125         $translations = $this->client->select( $fetchQuery );
00126 
00127         $suggestions = array();
00128         foreach ( $translations as $doc ) {
00129             /* Construct the matching source id */
00130             $sourceId = preg_replace( '~/[^/]+$~', '', $doc->globalid );
00131 
00132             /* Unfortunately we cannot do this on the search server,
00133              * because score is not a real field and thus cannot be
00134              * used in a filter query. */
00135             $quality = $scores[$sourceId];
00136             if ( $quality < $this->config['cutoff'] ) {
00137                 continue;
00138             }
00139 
00140             $suggestions[] = array(
00141                 'source' => $contents[$sourceId],
00142                 'target' => $doc->content,
00143                 'context' => $doc->messageid,
00144                 'quality' => $quality,
00145                 'wiki' => $doc->wiki,
00146                 'location' => $doc->messageid . '/' . $targetLanguage,
00147                 'uri' => $doc->uri,
00148             );
00149         }
00150 
00151         /* Like mentioned above, we get results in random order. Sort them
00152          * now to have best matches first as expected by callers. */
00153         uasort( $suggestions, function ( $a, $b ) {
00154             if ( $a['quality'] === $b['quality'] ) {
00155                 return 0;
00156             }
00157 
00158             return ( $a['quality'] < $b['quality'] ) ? 1 : -1;
00159         } );
00160 
00161         wfProfileOut( __METHOD__ );
00162 
00163         return $suggestions;
00164     }
00165 
00166     /* Write functions */
00167 
00168     public function update( MessageHandle $handle, $targetText ) {
00169         if ( $handle->getCode() === '' ) {
00170             return false;
00171         }
00172         wfProfileIn( __METHOD__ );
00173 
00174         /* There are various different cases here:
00175          * [new or updated] [fuzzy|non-fuzzy] [translation|definition]
00176          * 1) We don't distinguish between new or updated here.
00177          * 2) Delete old translation, but not definition
00178          * 3) Insert new translation or definition, if non-fuzzy
00179          * The definition should never be fuzzied anyway.
00180          *
00181          * These only apply to known messages.
00182          */
00183 
00184         $update = $this->client->createUpdate();
00185         $title = $handle->getTitle();
00186 
00187         $doDelete = true;
00188         $sourceLanguage = '';
00189         if ( $handle->isValid() ) {
00190             $sourceLanguage = $handle->getGroup()->getSourceLanguage();
00191             if ( $handle->getCode() === $sourceLanguage ) {
00192                 $doDelete = false;
00193             }
00194         }
00195 
00196         if ( $doDelete ) {
00197             $base = Title::makeTitle( $title->getNamespace(), $handle->getKey() );
00198             $conds = array(
00199                 'wiki' => wfWikiId(),
00200                 'language' => $handle->getCode(),
00201                 'messageid' => $base->getPrefixedText(),
00202             );
00203             foreach ( $conds as $key => &$value ) {
00204                 $value = "$key:" . $update->getHelper()->escapePhrase( $value );
00205             }
00206             $update->addDeleteQuery( implode( ' AND ', $conds ) );
00207         }
00208 
00209         if ( $targetText !== null ) {
00210             if ( $handle->isValid() ) {
00211                 // Of the message definition page
00212                 $targetTitle = $handle->getTitle();
00213                 $sourceTitle = Title::makeTitle(
00214                     $targetTitle->getNamespace(),
00215                     $handle->getKey() . '/' . $sourceLanguage
00216                 );
00217                 $revId = intval( $sourceTitle->getLatestRevID() );
00218                 /* Note: in some cases the source page might not exist, in this case
00219                  * we use 0 as message version identifier, to differentiate them from
00220                  * orphan messages */
00221             } else {
00222                 $revId = 'orphan';
00223             }
00224 
00225             $doc = $this->createDocument( $handle, $targetText, $revId );
00226             // Add document and commit within X seconds.
00227             $update->addDocument( $doc, null, self::COMMIT_WITHIN );
00228         }
00229 
00230         try {
00231             $this->client->update( $update );
00232         } catch ( Solarium_Exception $e ) {
00233             error_log( "SolrTTMServer update-write failed" );
00234             wfProfileOut( __METHOD__ );
00235 
00236             return false;
00237         }
00238 
00239         wfProfileOut( __METHOD__ );
00240 
00241         return true;
00242     }
00243 
00247     protected function createDocument( MessageHandle $handle, $text, $revId ) {
00248         $language = $handle->getCode();
00249         $translationTitle = $handle->getTitle();
00250 
00251         $title = Title::makeTitle( $handle->getTitle()->getNamespace(), $handle->getKey() );
00252         $wiki = wfWikiId();
00253         $messageid = $title->getPrefixedText();
00254         $globalid = "$wiki-$messageid-$revId/$language";
00255 
00256         $doc = new Solarium_Document_ReadWrite();
00257         $doc->wiki = $wiki;
00258         $doc->uri = $translationTitle->getCanonicalUrl();
00259         $doc->messageid = $messageid;
00260         $doc->globalid = $globalid;
00261 
00262         $doc->language = $language;
00263         $doc->content = $text;
00264         $doc->setField( 'group', $handle->getGroupIds() );
00265 
00266         return $doc;
00267     }
00268 
00269     public function beginBootstrap() {
00270         $update = $this->client->createUpdate();
00271         $query = 'wiki:' . $update->getHelper()->escapePhrase( wfWikiId() );
00272         $update->addDeleteQuery( $query );
00273         $update->addCommit();
00274         $this->client->update( $update );
00275     }
00276 
00277     public function beginBatch() {
00278         $this->revIds = array();
00279     }
00280 
00281     public function batchInsertDefinitions( array $batch ) {
00282         $lb = new LinkBatch();
00283         foreach ( $batch as $data ) {
00284             $lb->addObj( $data[0] );
00285         }
00286         $lb->execute();
00287 
00288         foreach ( $batch as $key => $data ) {
00289             $this->revIds[$key] = $data[0]->getLatestRevID();
00290         }
00291 
00292         $this->batchInsertTranslations( $batch );
00293     }
00294 
00295     public function batchInsertTranslations( array $batch ) {
00296         $update = $this->client->createUpdate();
00297         foreach ( $batch as $key => $data ) {
00298             list( $title, , $text ) = $data;
00299             $handle = new MessageHandle( $title );
00300             $doc = $this->createDocument( $handle, $text, $this->revIds[$key] );
00301             // Add document and commit within X seconds.
00302             $update->addDocument( $doc, null, self::COMMIT_WITHIN );
00303         }
00304         $this->client->update( $update );
00305     }
00306 
00307     public function endBatch() {
00308         $update = $this->client->createUpdate();
00309         $this->client->update( $update );
00310     }
00311 
00312     public function endBootstrap() {
00313         $update = $this->client->createUpdate();
00314         $update->addCommit();
00315         $update->addOptimize();
00316         $this->client->update( $update );
00317     }
00318 
00319     public function getSolarium() {
00320         return $this->client;
00321     }
00322 }
Generated on Tue Oct 29 00:00:25 2013 for MediaWiki Translate Extension by  doxygen 1.6.3