http://www.mediawiki.org/wiki/Special:Code/MediaWiki/68826

Revision: 68826
Author:   daniel
Date:     2010-07-01 10:31:44 +0000 (Thu, 01 Jul 2010)

Log Message:
-----------
make coherence score more sensitive to frequency: normalize before averaging 
and use a softer normalization curve

Modified Paths:
--------------
    trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
    trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java

Modified: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
===================================================================
--- trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java   2010-07-01 09:52:28 UTC (rev 68825)
+++ trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java   2010-07-01 10:31:44 UTC (rev 68826)
@@ -77,7 +77,10 @@
                        if (pop<0.5) return 0;
                        if (pop<1) pop=1;
                        
-                       double n = 1 - 1/(Math.sqrt(Math.log(pop))+1); //XXX: black voodoo magic ad hoc formula with no deeper meaing.
+                       //XXX: black voodoo magic ad hoc formula with no deeper meaing.
+                       //double n = 1 - 1/(Math.log(pop)+1); //normalized log scale
+                       //double n = 1 - 1/(Math.sqrt(Math.log(pop))+1); //dampened normalized log scale
+                       double n =1 - (0.5/Math.sqrt((pop+200)/200)); //nice and smooth, but has magic params that may depend on the wiki
                        return n;  
                }
        };
@@ -436,7 +439,8 @@
                
                LabeledVector<Integer> sum = ConceptFeatures.newIntFeaturVector( concepts.size() * 200 ); //XXX: magic number
                Map<Integer, ConceptFeatures<C, Integer>> disambigFeatures = new HashMap<Integer, ConceptFeatures<C, Integer>>();
-               double sim = 0, pop = 0, weight = 0;
+               double sim = 0, pop = 0, weight = 0, popf = 0, simf = 0;
+
                int i=0, j=0;
                for (Map.Entry<TermReference, C> ea: concepts.entrySet()) {
                        C a = ea.getValue();
@@ -496,6 +500,8 @@
                                d = doubleSanity(d, "normal similarity score for "+a+" / "+b, "check similarityMeasure!", 0, 0.1, 1, 0.1);
                                
                                sim += d;
+                               simf += similarityNormalizer.apply(d);          
+
                                simCount ++;
                        }
                        
@@ -508,7 +514,9 @@
                        
                        p = weightCombiner.apply(p, w);
                        
-                       pop += p; //XXX: keep raw and processed pop 
+                       pop += p;  
+                       popf += popularityNormalizer.apply(p);
+                       
                        weight += w; 
                }
                
@@ -523,14 +531,15 @@
                
                sim = n == 0 ? 0 : sim / n; //scale
                pop = c == 0 ? 0 : pop / c; //scale
+
+               simf = n == 0 ? 0 : simf / n; //scale
+               popf = c == 0 ? 0 : popf / c; //scale
+
                weight = c == 0 ? 0 : weight / c; //scale
                
                pop = doubleSanity(pop, "normal popularity", "check popularityMeasure!", 0, 0.1, Double.MAX_VALUE, 0);
                sim = doubleSanity(sim, "normal average simility", "ooops!", 0, 0.1, 1, 0.1);
                
-               double popf = popularityNormalizer.apply(pop);
-               double simf = similarityNormalizer.apply(sim);
-
                popf = doubleSanity(popf, "normal popularity", "check popularityNormalizer!", 0, 0.1, 1, 0.1);
                simf = doubleSanity(simf, "normal similarity", "check similarityNormalizer!", 0, 0.1, 1, 0.1);
                

Modified: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java
===================================================================
--- trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java       2010-07-01 09:52:28 UTC (rev 68825)
+++ trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java       2010-07-01 10:31:44 UTC (rev 68826)
@@ -6,10 +6,13 @@
 import java.util.List;
 import java.util.Map;
 
+import de.brightbyte.data.LabeledMatrix;
+import de.brightbyte.data.MapLabeledMatrix;
 import de.brightbyte.data.Pair;
 import de.brightbyte.io.ConsoleIO;
 import de.brightbyte.io.Output;
 import de.brightbyte.util.PersistenceException;
+import de.brightbyte.wikiword.disambig.CoherenceDisambiguator.CoherenceDisambiguation;
 import de.brightbyte.wikiword.disambig.Disambiguator.Interpretation;
 import de.brightbyte.wikiword.disambig.Disambiguator.Disambiguation;
 import de.brightbyte.wikiword.model.LocalConcept;
@@ -24,6 +27,33 @@
                super();
        }
 
+       public void testGetScore() throws PersistenceException {
+               CoherenceDisambiguator disambiguator = new CoherenceDisambiguator(meaningFetcher, featureFetcher, 10);
+               
+               LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
+               
+               LocalConcept city_of_London = getConcept("City_of_London");
+               LocalConcept united_Kingdom = getConcept("United_Kingdom");
+               
+               //united_Kingdom.setCardinality(100000);
+               
+               Pair<Term, LocalConcept> uk_as_United_Kingdom = new Pair<Term, LocalConcept>(new Term("UK"), united_Kingdom);
+               Pair<Term, LocalConcept> london_as_City_of_London = new Pair<Term, LocalConcept>(new Term("London"), city_of_London);
+
+               CoherenceDisambiguator.Interpretation interp = new CoherenceDisambiguator.Interpretation(uk_as_United_Kingdom, london_as_City_of_London);
+               CoherenceDisambiguation r1 = disambiguator.getScore(interp, null, similarities, featureFetcher);
+               
+               int oldPop = city_of_London.getCardinality();
+               city_of_London.setCardinality(oldPop*2);
+
+               CoherenceDisambiguation r2 = disambiguator.getScore(interp, null, similarities, featureFetcher);
+               city_of_London.setCardinality(oldPop);
+               
+               double score1 = r1.getScore();
+               double score2 = r2.getScore();
+               assertTrue("More popularity implies better score", score1 < score2 );
+       }
+       
        public void testGetSequenceInterpretations() throws PersistenceException {
                CoherenceDisambiguator disambiguator = new CoherenceDisambiguator(meaningFetcher, featureFetcher, 10);
                



_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to