http://www.mediawiki.org/wiki/Special:Code/MediaWiki/68832

Revision: 68832
Author:   daniel
Date:     2010-07-01 14:50:27 +0000 (Thu, 01 Jul 2010)

Log Message:
-----------
better indexes for faster access

Modified Paths:
--------------
    trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh
    trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql
    trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php
    trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php
    trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php
    trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php

Added Paths:
-----------
    trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql

Removed Paths:
-------------
    trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql

Modified: trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh
===================================================================
--- trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh       
2010-07-01 13:43:05 UTC (rev 68831)
+++ trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh       
2010-07-01 14:50:27 UTC (rev 68832)
@@ -4,12 +4,9 @@
 db="$1"
 collection="$2"
 thesaurus="$3"
-languages="en de fr nl it es pt pl"
 
-echo "preparing search index"
+echo "building search index"
 replace  '{collection}' "$collection" '{thesaurus}' "$thesaurus"  < 
search-index.sql | mysql "$db"
 
-for n in $languages; do
-    echo "collection search index: $n"
-    replace  '{collection}' "$collection" '{thesaurus}' "$thesaurus" '{lang}' 
"$n" < search-index-local.sql | mysql "$db"
-done
+echo "building resource index"
+replace  '{collection}' "$collection" '{thesaurus}' "$thesaurus"  < 
resource-index.sql | mysql "$db"

Added: trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql
===================================================================
--- trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql          
                (rev 0)
+++ trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql  
2010-07-01 14:50:27 UTC (rev 68832)
@@ -0,0 +1,14 @@
+create table if not exists {collection}_{thesaurus}_resource_index (
+     concept int(11) NOT NULL,
+     resources MEDIUMBLOB int(11) NOT NULL,
+     PRIMARY KEY ( concept ),
+     ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
+
+truncate {collection}_{thesaurus}_resource_index;
+
+-- collect ressources in all languages
+insert into {collection}_{thesaurus}_resource_index ( 
+     concept, resources
+select concept, group_concat(distinct concat(type, ":", lang, ":", 
local_resource_name) separator "|" ) as resources 
+from {collection}_{thesaurus}_about as A
+where type > 0;

Deleted: trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql
===================================================================
--- trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql      
2010-07-01 13:43:05 UTC (rev 68831)
+++ trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql      
2010-07-01 14:50:27 UTC (rev 68832)
@@ -1,16 +0,0 @@
--- collect definitions
-insert into {collection}_{thesaurus}_search_index ( 
-     concept, concept_name, `type`,
-     `lang`, `term`, `score`, `norm` )
-select O.global_concept, M.concept_name, C.type, "{lang}", 
-      REPLACE( LCASE( CAST(M.term_text as CHAR CHARACTER SET utf8) COLLATE 
utf8_general_ci ), "-", "" ), 
-      M.rule * M.freq, 1
-from {collection}_{lang}_meaning as M
-join {collection}_{thesaurus}_origin as O on O.lang = "{lang}" and 
O.local_concept = M.concept
-join {collection}_{thesaurus}_concept as C on C.id = O.global_concept
-where (M.rule not in (10, 30) OR M.freq > 1) and C.type > 0
-on duplicate key update 
-  score = if (score > values(score), score, values(score)),
-  norm = if (norm < values(norm), score, values(norm));
-
--- FIXME: normalization levels! 0=none, 1=case-and-dash (+translit?), 
2=whitespace-and-punctuation, 4=soundex
\ No newline at end of file

Modified: trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql
===================================================================
--- trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql    
2010-07-01 13:43:05 UTC (rev 68831)
+++ trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql    
2010-07-01 14:50:27 UTC (rev 68832)
@@ -11,3 +11,19 @@
      ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
 
 truncate {collection}_{thesaurus}_search_index;
+
+-- collect definitions
+insert into {collection}_{thesaurus}_search_index ( 
+     concept, concept_name, `type`, `lang`, `term`, `score`, `norm` )
+select C.id, O.local_concept_name, C.type, M.lang,  
+      REPLACE( LCASE( CAST(M.term_text as CHAR CHARACTER SET utf8) COLLATE 
utf8_general_ci ), "-", "" ), 
+      M.rule * M.freq, 1
+from {collection}_{thesaurus}_meaning as M
+join {collection}_{thesaurus}_concept as C on C.id = M.concept
+join {collection}_{thesaurus}_origin as O on O.global_concept = M.concept and 
O.lang = M.lang -- FIXME: remove this once the global menaing table contains 
the local concept name
+where (M.rule not in (10, 30) OR M.freq > 1) and C.type > 0
+on duplicate key update 
+  score = if (score > values(score), score, values(score)),
+  norm = if (norm < values(norm), score, values(norm));
+
+-- FIXME: normalization levels! 0=none, 1=case-and-dash (+translit?), 
2=whitespace-and-punctuation, 4=soundex
\ No newline at end of file

Modified: trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php
===================================================================
--- trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php 2010-07-01 
13:43:05 UTC (rev 68831)
+++ trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php 2010-07-01 
14:50:27 UTC (rev 68832)
@@ -94,11 +94,13 @@
        return $p['scores'];
     }*/
 
-    function getConceptInfo( $id, $lang = null ) {
+    function getConceptInfo( $id, $lang = null, $fields = null, $rclang = null 
) {
        $param = array(
                'query' => 'info',
                'gcid' => $id,
-               'lang' => $lang
+               'lang' => $lang,
+               'fields' => fields,
+               'rclang' => $rclang
        );
 
        $rs = $this->query( $param );
@@ -132,12 +134,13 @@
        return $rs;
     }*/
 
-    function getConceptsForTerm( $qlang, $term, $languages, $norm = 1, $limit 
= 100 ) {
+    function getConceptsForTerm( $qlang, $term, $languages, $norm = 1, $rclang 
= null, $limit = 100 ) {
        if ( is_array( $languages ) ) $languages = implode('|', $languages);
 
        $param = array(
                'query' => 'concepts',
                'qlang' => $qlang,
+               'rclang' => $rclang,
                'lang' => $languages,
                'norm' => $norm,
                'term' => $term,

Modified: trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php
===================================================================
--- trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php      
2010-07-01 13:43:05 UTC (rev 68831)
+++ trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php      
2010-07-01 14:50:27 UTC (rev 68832)
@@ -72,20 +72,26 @@
        return $s;
     }
 
-    function queryConceptsForTerm($qlang, $term, $languages, $norm = 1, $limit 
= 100) {
+    function queryConceptsForTerm($qlang, $term, $languages, $norm = 1, 
$rclang = null, $limit = 100) {
        global $wwTablePrefix, $wwThesaurusDataset, $wwLanguages;
 
        if ( !$languages ) $languages = array_keys( $wwLanguages );
 
        $term = $this->normalizeSearchString($term, $norm);
 
-       $sql = "SELECT I.*, S.score FROM 
{$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I"
-             . " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_search_index as S 
ON I.concept = S.concept "
-             . " WHERE term = " . $this->quote($term) 
+       $sql = "SELECT I.*, S.score ";
+       if ( $rclang ) $sql .= ", R.resources "
+
+       $sql .= " FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I 
";
+       $sql .= " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_search_index as S 
ON I.concept = S.concept ";
+       if ( $rclang ) $sql .= " JOIN 
{$wwTablePrefix}_{$wwThesaurusDataset}_resource_index as R ON R.concept = 
I.concept "
+
+       $sql .= " WHERE term = " . $this->quote($term) 
              . " AND I.lang IN " . $this->quoteSet($languages) 
              . " AND S.lang = " . $this->quote($qlang) 
-             . " AND S.norm <= " . (int)$norm
-             . " ORDER BY S.score DESC, S.concept "
+             . " AND S.norm <= " . (int)$norm;
+
+       $sql .= " ORDER BY S.score DESC, S.concept "
              . " LIMIT " . (int)$limit;
 
        #FIXME: query-lang vs. output-languages!
@@ -93,8 +99,8 @@
        return $this->query($sql);
     }
 
-    function getConceptsForTerm($qlang, $term, $languages, $norm = 1, $limit = 
100) {
-       $rs = $this->queryConceptsForTerm($qlang, $term, $languages);
+    function getConceptsForTerm($qlang, $term, $languages, $norm = 1, $rclang 
= null, $limit = 100) {
+       $rs = $this->queryConceptsForTerm($qlang, $term, $languages, $norm, 
$rclang, $limit);
        $list = WWUtils::slurpRows($rs);
        mysql_free_result($rs);
        return $this->buildConcepts($list);
@@ -243,6 +249,23 @@
     }
     */
 
+    function spliceResources( $rc, &$into ) {
+       if (!$rc) return;
+
+       if (is_string($rc)) {
+               $rr = explode("|", $rc);
+
+               $rc = array();
+               foreach ($rr as $r) {
+                   list($t, $lang, $n) = explode(":", $p, 3);
+                   $rc[$lang][$n] = (int)$t;
+               }
+       }
+
+       if (!$into) $into = $rc;
+       else $into = array_merge( $into, $rc );
+    }
+
     function splitPages( $s ) {
        $pp = explode("|", $s);
 
@@ -269,7 +292,7 @@
     }
 
     /////////////////////////////////////////////////////////
-    function getConceptInfo( $id, $lang = null, $fields = null ) {
+    function getConceptInfo( $id, $lang = null, $fields = null, $rclang = null 
) {
        global $wwTablePrefix, $wwThesaurusDataset;
 
        #TODO: concept cache!
@@ -279,9 +302,14 @@
 
        #TODO: scores, concept-type, ...
 
-       $sql = "SELECT $fields FROM 
{$wwTablePrefix}_{$wwThesaurusDataset}_concept_info "
-           . " WHERE concept = ".(int)$id;
+       $sql = "SELECT $fields ";
+       if ( $rclang ) $sql .= ", R.resources ";
+         
+       $sql .= " FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I 
";
+       if ( $rclang ) $sql .= " JOIN 
{$wwTablePrefix}_{$wwThesaurusDataset}_resource_index as R ON R.concept = 
I.concept ";
 
+       $sql .= " WHERE concept = ".(int)$id;
+
        if ($lang) {
            if ( is_array($lang) ) $sql .= " AND lang IN " . 
$this->quoteSet($lang);
            else $sql .= " AND lang = " . $this->quote($lang);
@@ -352,8 +380,10 @@
 
            if (@$row["name"] !== null) $concept["name"][$lang] = $row["name"];
            if (@$row["definition"] !== null) $concept["definition"][$lang] = 
$row["definition"];
-           if (@$row["pages"] !== null) $concept["pages"][$lang] = 
$this->splitPages($row["pages"]);
 
+           if (@$row["resources"])  $this->spliceResources($row["resources"], 
&$concept["pages"]) );
+           else if (@$row["pages"]) $concept["pages"][$lang] = 
$this->splitPages($row["pages"]);
+
            if (@$row["broader"] !== null)  $broader[$lang] =  
$this->splitConcepts($row["broader"]);
            if (@$row["narrower"] !== null) $narrower[$lang] = 
$this->splitConcepts($row["narrower"]);
            if (@$row["similar"] !== null)  $similar[$lang] =  
$this->splitConcepts($row["similar"]);

Modified: trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php
===================================================================
--- trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php 2010-07-01 
13:43:05 UTC (rev 68831)
+++ trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php 2010-07-01 
14:50:27 UTC (rev 68832)
@@ -200,7 +200,7 @@
 $mode = NULL;
 $result = NULL;
 
-$fallback_languages = array( "en", "commons" ); #TODO: make the user define 
this list
+$fallback_languages = array( "en" ); #TODO: make the user define this list
 
 if ( $lang ) {
     $languages = explode( '|', $lang );
@@ -222,11 +222,11 @@
   try {
       if ($lang && $conceptId) {
          $mode = "concept";
-         $result = $thesaurus->getConceptInfo($conceptId, $allLanguages);
+         $result = $thesaurus->getConceptInfo($conceptId, $lang, null, 
$allLanguages);
          if ( $result ) $result = array( $result ); //hack
       } else if ($lang && $term) {
                  $mode = "term";
-                 $result = $thesaurus->getConceptsForTerm($lang, $term, 
$allLanguages, $norm, $limit);
+                 $result = $thesaurus->getConceptsForTerm($lang, $term, 
$languages, $norm, $allLanguages, $limit);
       } 
   } catch (Exception $e) {
       $error = $e->getMessage();
@@ -237,7 +237,8 @@
 if (!isset($scriptPath)) $scriptPath = "./";
 if (!isset($skinPath)) $skinPath = "$scriptPath/../skin/";
 
-if ( $format == "atom" || $format == "xml" || $format == "opensearch" ) 
include("response.atom.php"); 
-else include("response.html.php"); 
+/*if ( $format == "atom" || $format == "xml" || $format == "opensearch" ) 
include("response.atom.php"); 
+else*/ 
+include("response.html.php"); 
 
 $utils->close();

Modified: trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php
===================================================================
--- trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php    2010-07-01 
13:43:05 UTC (rev 68831)
+++ trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php    2010-07-01 
14:50:27 UTC (rev 68832)
@@ -10,16 +10,19 @@
 if ( $query ) {
     $lang = @$_REQUEST['lang'];
     $qlang = @$_REQUEST['qlang'];
+    $rclang = @$_REQUEST['rclang'];
     $format = @$_REQUEST['format'];
     if ( !$format ) $format = 'phps';
 
     if ($lang) $lang = preg_replace('[^\\w\\d_]', '', $lang);
     if ($qlang) $qlang = preg_replace('[^\\w\\d_]', '', $qlang);
+    
+    if ($rclang) $rclang = preg_replace('[^\\w\\d_]', '', $rclang);
 
     if ($lang) {
-       $lang = preg_split('![\\s,;|/:]\\s*!', $lang);
-       if ( !$qlang ) $qlang = $lang[0];
-       if (count($lang) == 1) $lang = $lang[0];
+               $lang = preg_split('![\\s,;|/:]\\s*!', $lang);
+               if ( !$qlang ) $qlang = $lang[0];
+               if (count($lang) == 1) $lang = $lang[0];
     }
 
     $result = array( 'query' => $query );
@@ -40,29 +43,30 @@
 
            if ( $qlang === null ) $result['error'] = array('code' => 150, 
'message' => "missing parameter qlang");
            else if ( $term !== null ) {
-               $result['concepts'] = $thesaurus->getConceptsForTerm($qlang, 
$term, $lang, $norm); #TODO: limit!
-               if ( $result['concepts'] === false || $result['concepts'] === 
null ) {
-                   $result['error'] = array('code' => 210, 'message' => 
"failed to retrieve concepts for term $langt:$term");
-               }
+                       $result['concepts'] = 
$thesaurus->getConceptsForTerm($qlang, $term, $lang, $norm, $rclang); #TODO: 
limit!
+                       if ( $result['concepts'] === false || 
$result['concepts'] === null ) {
+                           $result['error'] = array('code' => 210, 'message' 
=> "failed to retrieve concepts for term $langt:$term");
+                       }
            } /*else if ( $page !== null ) {
                $result['concepts'] = $thesaurus->getConceptsForPage($lang, 
$page);
                if ( $result['concepts'] === false || $result['concepts'] === 
null ) {
                    $result['error'] = array('code' => 250, 'message' => 
"failed to retrieve concepts for page $langt:$page");
                }
            } */else {
-               $result['error'] = array('code' => 110, 'message' => "missing 
parameter term");
+                       $result['error'] = array('code' => 110, 'message' => 
"missing parameter term");
            }
        } else if ($query == 'concept' || $query == 'info') {
            $gcid = @$_REQUEST['gcid'];
            if (!$gcid) $gcid = @$_REQUEST['id'];
 
            if ( $gcid === null ) {
-               $result['error'] = array('code' => 120, 'message' => "missing 
parameter gcid");
+                       $result['error'] = array('code' => 120, 'message' => 
"missing parameter gcid");
            } else {
-               $result['concept'] = $thesaurus->getConceptInfo($gcid, $lang); 
#TODO: limit!
-               if ( $result['concept'] === false || $result['concept'] === 
null ) {
-                   $result['error'] = array('code' => 210, 'message' => 
"concept not found: $gcid");
-               }
+                   $fields = null;
+                       $result['concept'] = $thesaurus->getConceptInfo($gcid, 
$lang, $fields, $rclang); #TODO: limit!
+                       if ( $result['concept'] === false || $result['concept'] 
=== null ) {
+                           $result['error'] = array('code' => 210, 'message' 
=> "concept not found: $gcid");
+                       }
            }
        } else if ($query == 'properties') {
            $gcid = @$_REQUEST['gcid'];



_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to