http://www.mediawiki.org/wiki/Special:Code/MediaWiki/68826
Revision: 68826
Author: daniel
Date: 2010-07-01 10:31:44 +0000 (Thu, 01 Jul 2010)

Log Message:
-----------
make coherence score more sensitive to frequency: normalize before averaging and use a softer normalization curve

Modified Paths:
--------------
    trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
    trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java

Modified: trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java
===================================================================
--- trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java 2010-07-01 09:52:28 UTC (rev 68825)
+++ trunk/WikiWord/WikiWord/src/main/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguator.java 2010-07-01 10:31:44 UTC (rev 68826)
@@ -77,7 +77,10 @@
             if (pop<0.5) return 0;
             if (pop<1) pop=1;
-            double n = 1 - 1/(Math.sqrt(Math.log(pop))+1); //XXX: black voodoo magic ad hoc formula with no deeper meaing.
+            //XXX: black voodoo magic ad hoc formula with no deeper meaing.
+            //double n = 1 - 1/(Math.log(pop)+1); //normalized log scale
+            //double n = 1 - 1/(Math.sqrt(Math.log(pop))+1); //dampened normalized log scale
+            double n =1 - (0.5/Math.sqrt((pop+200)/200)); //nice and smooth, but has magic params that may depend on the wiki
             return n;
         }
     };
@@ -436,7 +439,8 @@
         LabeledVector<Integer> sum = ConceptFeatures.newIntFeaturVector( concepts.size() * 200 ); //XXX: magic number
         Map<Integer, ConceptFeatures<C, Integer>> disambigFeatures = new HashMap<Integer, ConceptFeatures<C, Integer>>();
-        double sim = 0, pop = 0, weight = 0;
+        double sim = 0, pop = 0, weight = 0, popf = 0, simf = 0;
+        int i=0, j=0;
         for (Map.Entry<TermReference, C> ea: concepts.entrySet()) {
             C a = ea.getValue();
@@ -496,6 +500,8 @@
                 d = doubleSanity(d, "normal similarity score for "+a+" / "+b, "check similarityMeasure!", 0, 0.1, 1, 0.1);
                 sim += d;
+                simf += similarityNormalizer.apply(d);
+
                 simCount ++;
             }
@@ -508,7 +514,9 @@
             p = weightCombiner.apply(p, w);
-            pop += p; //XXX: keep raw and processed pop
+            pop += p;
+            popf += popularityNormalizer.apply(p);
+
             weight += w;
         }
@@ -523,14 +531,15 @@
         sim = n == 0 ? 0 : sim / n; //scale
         pop = c == 0 ? 0 : pop / c; //scale
+
+        simf = n == 0 ? 0 : simf / n; //scale
+        popf = c == 0 ? 0 : popf / c; //scale
+
         weight = c == 0 ? 0 : weight / c; //scale
         pop = doubleSanity(pop, "normal popularity", "check popularityMeasure!", 0, 0.1, Double.MAX_VALUE, 0);
         sim = doubleSanity(sim, "normal average simility", "ooops!", 0, 0.1, 1, 0.1);
-        double popf = popularityNormalizer.apply(pop);
-        double simf = similarityNormalizer.apply(sim);
-
         popf = doubleSanity(popf, "normal popularity", "check popularityNormalizer!", 0, 0.1, 1, 0.1);
         simf = doubleSanity(simf, "normal similarity", "check similarityNormalizer!", 0, 0.1, 1, 0.1);

Modified: trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java
===================================================================
--- trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java 2010-07-01 09:52:28 UTC (rev 68825)
+++ trunk/WikiWord/WikiWord/src/test/java/de/brightbyte/wikiword/disambig/CoherenceDisambiguatorTest.java 2010-07-01 10:31:44 UTC (rev 68826)
@@ -6,10 +6,13 @@
 import java.util.List;
 import java.util.Map;
+import de.brightbyte.data.LabeledMatrix;
+import de.brightbyte.data.MapLabeledMatrix;
 import de.brightbyte.data.Pair;
 import de.brightbyte.io.ConsoleIO;
 import de.brightbyte.io.Output;
 import de.brightbyte.util.PersistenceException;
+import de.brightbyte.wikiword.disambig.CoherenceDisambiguator.CoherenceDisambiguation;
 import de.brightbyte.wikiword.disambig.Disambiguator.Interpretation;
 import de.brightbyte.wikiword.disambig.Disambiguator.Disambiguation;
 import de.brightbyte.wikiword.model.LocalConcept;
@@ -24,6 +27,33 @@
         super();
     }
+    public void testGetScore() throws PersistenceException {
+        CoherenceDisambiguator disambiguator = new CoherenceDisambiguator(meaningFetcher, featureFetcher, 10);
+
+        LabeledMatrix<LocalConcept, LocalConcept> similarities = new MapLabeledMatrix<LocalConcept, LocalConcept>(true);
+
+        LocalConcept city_of_London = getConcept("City_of_London");
+        LocalConcept united_Kingdom = getConcept("United_Kingdom");
+
+        //united_Kingdom.setCardinality(100000);
+
+        Pair<Term, LocalConcept> uk_as_United_Kingdom = new Pair<Term, LocalConcept>(new Term("UK"), united_Kingdom);
+        Pair<Term, LocalConcept> london_as_City_of_London = new Pair<Term, LocalConcept>(new Term("London"), city_of_London);
+
+        CoherenceDisambiguator.Interpretation interp = new CoherenceDisambiguator.Interpretation(uk_as_United_Kingdom, london_as_City_of_London);
+        CoherenceDisambiguation r1 = disambiguator.getScore(interp, null, similarities, featureFetcher);
+
+        int oldPop = city_of_London.getCardinality();
+        city_of_London.setCardinality(oldPop*2);
+
+        CoherenceDisambiguation r2 = disambiguator.getScore(interp, null, similarities, featureFetcher);
+        city_of_London.setCardinality(oldPop);
+
+        double score1 = r1.getScore();
+        double score2 = r2.getScore();
+        assertTrue("More popularity implies better score", score1 < score2 );
+    }
+
     public void testGetSequenceInterpretations() throws PersistenceException {
         CoherenceDisambiguator disambiguator = new CoherenceDisambiguator(meaningFetcher, featureFetcher, 10);

_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs