http://www.mediawiki.org/wiki/Special:Code/MediaWiki/68832
Revision: 68832 Author: daniel Date: 2010-07-01 14:50:27 +0000 (Thu, 01 Jul 2010) Log Message: ----------- better indexes for faster access Modified Paths: -------------- trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php Added Paths: ----------- trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql Removed Paths: ------------- trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql Modified: trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh =================================================================== --- trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh 2010-07-01 13:43:05 UTC (rev 68831) +++ trunk/WikiWord/WikiWordWeb/src/main/maintenance/build-search-index.sh 2010-07-01 14:50:27 UTC (rev 68832) @@ -4,12 +4,9 @@ db="$1" collection="$2" thesaurus="$3" -languages="en de fr nl it es pt pl" -echo "preparing search index" +echo "building search index" replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" < search-index.sql | mysql "$db" -for n in $languages; do - echo "collection search index: $n" - replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" '{lang}' "$n" < search-index-local.sql | mysql "$db" -done +echo "building resource index" +replace '{collection}' "$collection" '{thesaurus}' "$thesaurus" < resource-index.sql | mysql "$db" Added: trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql =================================================================== --- trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql (rev 0) +++ trunk/WikiWord/WikiWordWeb/src/main/maintenance/resource-index.sql 2010-07-01 14:50:27 UTC (rev 68832) @@ -0,0 +1,14 @@ +create table if not exists {collection}_{thesaurus}_resource_index ( + concept int(11) NOT NULL, + resources MEDIUMBLOB int(11) NOT NULL, + PRIMARY KEY ( concept ), + ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin; + +truncate {collection}_{thesaurus}_resource_index; + +-- collect ressources in all languages +insert into {collection}_{thesaurus}_resource_index ( + concept, resources +select concept, group_concat(distinct concat(type, ":", lang, ":", local_resource_name) separator "|" ) as resources +from {collection}_{thesaurus}_about as A +where type > 0; Deleted: trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql =================================================================== --- trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql 2010-07-01 13:43:05 UTC (rev 68831) +++ trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index-local.sql 2010-07-01 14:50:27 UTC (rev 68832) @@ -1,16 +0,0 @@ --- collect definitions -insert into {collection}_{thesaurus}_search_index ( - concept, concept_name, `type`, - `lang`, `term`, `score`, `norm` ) -select O.global_concept, M.concept_name, C.type, "{lang}", - REPLACE( LCASE( CAST(M.term_text as CHAR CHARACTER SET utf8) COLLATE utf8_general_ci ), "-", "" ), - M.rule * M.freq, 1 -from {collection}_{lang}_meaning as M -join {collection}_{thesaurus}_origin as O on O.lang = "{lang}" and O.local_concept = M.concept -join {collection}_{thesaurus}_concept as C on C.id = O.global_concept -where (M.rule not in (10, 30) OR M.freq > 1) and C.type > 0 -on duplicate key update - score = if (score > values(score), score, values(score)), - norm = if (norm < values(norm), score, values(norm)); - --- FIXME: normalization levels! 0=none, 1=case-and-dash (+translit?), 2=whitespace-and-punctuation, 4=soundex \ No newline at end of file Modified: trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql =================================================================== --- trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql 2010-07-01 13:43:05 UTC (rev 68831) +++ trunk/WikiWord/WikiWordWeb/src/main/maintenance/search-index.sql 2010-07-01 14:50:27 UTC (rev 68832) @@ -11,3 +11,19 @@ ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin; truncate {collection}_{thesaurus}_search_index; + +-- collect definitions +insert into {collection}_{thesaurus}_search_index ( + concept, concept_name, `type`, `lang`, `term`, `score`, `norm` ) +select C.id, O.local_concept_name, C.type, M.lang, + REPLACE( LCASE( CAST(M.term_text as CHAR CHARACTER SET utf8) COLLATE utf8_general_ci ), "-", "" ), + M.rule * M.freq, 1 +from {collection}_{thesaurus}_meaning as M +join {collection}_{thesaurus}_concept as C on C.id = M.concept +join {collection}_{thesaurus}_origin as O on O.global_concept = M.concept and O.lang = M.lang -- FIXME: remove this once the global menaing table contains the local concept name +where (M.rule not in (10, 30) OR M.freq > 1) and C.type > 0 +on duplicate key update + score = if (score > values(score), score, values(score)), + norm = if (norm < values(norm), score, values(norm)); + +-- FIXME: normalization levels! 0=none, 1=case-and-dash (+translit?), 2=whitespace-and-punctuation, 4=soundex \ No newline at end of file Modified: trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php =================================================================== --- trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php 2010-07-01 13:43:05 UTC (rev 68831) +++ trunk/WikiWord/WikiWordWeb/src/main/www/common/wwclient.php 2010-07-01 14:50:27 UTC (rev 68832) @@ -94,11 +94,13 @@ return $p['scores']; }*/ - function getConceptInfo( $id, $lang = null ) { + function getConceptInfo( $id, $lang = null, $fields = null, $rclang = null ) { $param = array( 'query' => 'info', 'gcid' => $id, - 'lang' => $lang + 'lang' => $lang, + 'fields' => fields, + 'rclang' => $rclang ); $rs = $this->query( $param ); @@ -132,12 +134,13 @@ return $rs; }*/ - function getConceptsForTerm( $qlang, $term, $languages, $norm = 1, $limit = 100 ) { + function getConceptsForTerm( $qlang, $term, $languages, $norm = 1, $rclang = null, $limit = 100 ) { if ( is_array( $languages ) ) $languages = implode('|', $languages); $param = array( 'query' => 'concepts', 'qlang' => $qlang, + 'rclang' => $rclang, 'lang' => $languages, 'norm' => $norm, 'term' => $term, Modified: trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php =================================================================== --- trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php 2010-07-01 13:43:05 UTC (rev 68831) +++ trunk/WikiWord/WikiWordWeb/src/main/www/common/wwthesaurus.php 2010-07-01 14:50:27 UTC (rev 68832) @@ -72,20 +72,26 @@ return $s; } - function queryConceptsForTerm($qlang, $term, $languages, $norm = 1, $limit = 100) { + function queryConceptsForTerm($qlang, $term, $languages, $norm = 1, $rclang = null, $limit = 100) { global $wwTablePrefix, $wwThesaurusDataset, $wwLanguages; if ( !$languages ) $languages = array_keys( $wwLanguages ); $term = $this->normalizeSearchString($term, $norm); - $sql = "SELECT I.*, S.score FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I" - . " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_search_index as S ON I.concept = S.concept " - . " WHERE term = " . $this->quote($term) + $sql = "SELECT I.*, S.score "; + if ( $rclang ) $sql .= ", R.resources " + + $sql .= " FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I "; + $sql .= " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_search_index as S ON I.concept = S.concept "; + if ( $rclang ) $sql .= " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_resource_index as R ON R.concept = I.concept " + + $sql .= " WHERE term = " . $this->quote($term) . " AND I.lang IN " . $this->quoteSet($languages) . " AND S.lang = " . $this->quote($qlang) - . " AND S.norm <= " . (int)$norm - . " ORDER BY S.score DESC, S.concept " + . " AND S.norm <= " . (int)$norm; + + $sql .= " ORDER BY S.score DESC, S.concept " . " LIMIT " . (int)$limit; #FIXME: query-lang vs. output-languages! @@ -93,8 +99,8 @@ return $this->query($sql); } - function getConceptsForTerm($qlang, $term, $languages, $norm = 1, $limit = 100) { - $rs = $this->queryConceptsForTerm($qlang, $term, $languages); + function getConceptsForTerm($qlang, $term, $languages, $norm = 1, $rclang = null, $limit = 100) { + $rs = $this->queryConceptsForTerm($qlang, $term, $languages, $norm, $rclang, $limit); $list = WWUtils::slurpRows($rs); mysql_free_result($rs); return $this->buildConcepts($list); @@ -243,6 +249,23 @@ } */ + function spliceResources( $rc, &$into ) { + if (!$rc) return; + + if (is_string($rc)) { + $rr = explode("|", $rc); + + $rc = array(); + foreach ($rr as $r) { + list($t, $lang, $n) = explode(":", $p, 3); + $rc[$lang][$n] = (int)$t; + } + } + + if (!$into) $into = $rc; + else $into = array_merge( $into, $rc ); + } + function splitPages( $s ) { $pp = explode("|", $s); @@ -269,7 +292,7 @@ } ///////////////////////////////////////////////////////// - function getConceptInfo( $id, $lang = null, $fields = null ) { + function getConceptInfo( $id, $lang = null, $fields = null, $rclang = null ) { global $wwTablePrefix, $wwThesaurusDataset; #TODO: concept cache! @@ -279,9 +302,14 @@ #TODO: scores, concept-type, ... - $sql = "SELECT $fields FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info " - . " WHERE concept = ".(int)$id; + $sql = "SELECT $fields "; + if ( $rclang ) $sql .= ", R.resources "; + + $sql .= " FROM {$wwTablePrefix}_{$wwThesaurusDataset}_concept_info as I "; + if ( $rclang ) $sql .= " JOIN {$wwTablePrefix}_{$wwThesaurusDataset}_resource_index as R ON R.concept = I.concept "; + $sql .= " WHERE concept = ".(int)$id; + if ($lang) { if ( is_array($lang) ) $sql .= " AND lang IN " . $this->quoteSet($lang); else $sql .= " AND lang = " . $this->quote($lang); @@ -352,8 +380,10 @@ if (@$row["name"] !== null) $concept["name"][$lang] = $row["name"]; if (@$row["definition"] !== null) $concept["definition"][$lang] = $row["definition"]; - if (@$row["pages"] !== null) $concept["pages"][$lang] = $this->splitPages($row["pages"]); + if (@$row["resources"]) $this->spliceResources($row["resources"], &$concept["pages"]) ); + else if (@$row["pages"]) $concept["pages"][$lang] = $this->splitPages($row["pages"]); + if (@$row["broader"] !== null) $broader[$lang] = $this->splitConcepts($row["broader"]); if (@$row["narrower"] !== null) $narrower[$lang] = $this->splitConcepts($row["narrower"]); if (@$row["similar"] !== null) $similar[$lang] = $this->splitConcepts($row["similar"]); Modified: trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php =================================================================== --- trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php 2010-07-01 13:43:05 UTC (rev 68831) +++ trunk/WikiWord/WikiWordWeb/src/main/www/wikipics/search.php 2010-07-01 14:50:27 UTC (rev 68832) @@ -200,7 +200,7 @@ $mode = NULL; $result = NULL; -$fallback_languages = array( "en", "commons" ); #TODO: make the user define this list +$fallback_languages = array( "en" ); #TODO: make the user define this list if ( $lang ) { $languages = explode( '|', $lang ); @@ -222,11 +222,11 @@ try { if ($lang && $conceptId) { $mode = "concept"; - $result = $thesaurus->getConceptInfo($conceptId, $allLanguages); + $result = $thesaurus->getConceptInfo($conceptId, $lang, null, $allLanguages); if ( $result ) $result = array( $result ); //hack } else if ($lang && $term) { $mode = "term"; - $result = $thesaurus->getConceptsForTerm($lang, $term, $allLanguages, $norm, $limit); + $result = $thesaurus->getConceptsForTerm($lang, $term, $languages, $norm, $allLanguages, $limit); } } catch (Exception $e) { $error = $e->getMessage(); @@ -237,7 +237,8 @@ if (!isset($scriptPath)) $scriptPath = "./"; if (!isset($skinPath)) $skinPath = "$scriptPath/../skin/"; -if ( $format == "atom" || $format == "xml" || $format == "opensearch" ) include("response.atom.php"); -else include("response.html.php"); +/*if ( $format == "atom" || $format == "xml" || $format == "opensearch" ) include("response.atom.php"); +else*/ +include("response.html.php"); $utils->close(); Modified: trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php =================================================================== --- trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php 2010-07-01 13:43:05 UTC (rev 68831) +++ trunk/WikiWord/WikiWordWeb/src/main/www/wikiword/api.php 2010-07-01 14:50:27 UTC (rev 68832) @@ -10,16 +10,19 @@ if ( $query ) { $lang = @$_REQUEST['lang']; $qlang = @$_REQUEST['qlang']; + $rclang = @$_REQUEST['rclang']; $format = @$_REQUEST['format']; if ( !$format ) $format = 'phps'; if ($lang) $lang = preg_replace('[^\\w\\d_]', '', $lang); if ($qlang) $qlang = preg_replace('[^\\w\\d_]', '', $qlang); + + if ($rclang) $rclang = preg_replace('[^\\w\\d_]', '', $rclang); if ($lang) { - $lang = preg_split('![\\s,;|/:]\\s*!', $lang); - if ( !$qlang ) $qlang = $lang[0]; - if (count($lang) == 1) $lang = $lang[0]; + $lang = preg_split('![\\s,;|/:]\\s*!', $lang); + if ( !$qlang ) $qlang = $lang[0]; + if (count($lang) == 1) $lang = $lang[0]; } $result = array( 'query' => $query ); @@ -40,29 +43,30 @@ if ( $qlang === null ) $result['error'] = array('code' => 150, 'message' => "missing parameter qlang"); else if ( $term !== null ) { - $result['concepts'] = $thesaurus->getConceptsForTerm($qlang, $term, $lang, $norm); #TODO: limit! - if ( $result['concepts'] === false || $result['concepts'] === null ) { - $result['error'] = array('code' => 210, 'message' => "failed to retrieve concepts for term $langt:$term"); - } + $result['concepts'] = $thesaurus->getConceptsForTerm($qlang, $term, $lang, $norm, $rclang); #TODO: limit! + if ( $result['concepts'] === false || $result['concepts'] === null ) { + $result['error'] = array('code' => 210, 'message' => "failed to retrieve concepts for term $langt:$term"); + } } /*else if ( $page !== null ) { $result['concepts'] = $thesaurus->getConceptsForPage($lang, $page); if ( $result['concepts'] === false || $result['concepts'] === null ) { $result['error'] = array('code' => 250, 'message' => "failed to retrieve concepts for page $langt:$page"); } } */else { - $result['error'] = array('code' => 110, 'message' => "missing parameter term"); + $result['error'] = array('code' => 110, 'message' => "missing parameter term"); } } else if ($query == 'concept' || $query == 'info') { $gcid = @$_REQUEST['gcid']; if (!$gcid) $gcid = @$_REQUEST['id']; if ( $gcid === null ) { - $result['error'] = array('code' => 120, 'message' => "missing parameter gcid"); + $result['error'] = array('code' => 120, 'message' => "missing parameter gcid"); } else { - $result['concept'] = $thesaurus->getConceptInfo($gcid, $lang); #TODO: limit! - if ( $result['concept'] === false || $result['concept'] === null ) { - $result['error'] = array('code' => 210, 'message' => "concept not found: $gcid"); - } + $fields = null; + $result['concept'] = $thesaurus->getConceptInfo($gcid, $lang, $fields, $rclang); #TODO: limit! + if ( $result['concept'] === false || $result['concept'] === null ) { + $result['error'] = array('code' => 210, 'message' => "concept not found: $gcid"); + } } } else if ($query == 'properties') { $gcid = @$_REQUEST['gcid']; _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs