[ https://issues.apache.org/jira/browse/LUCENE-1039?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12630434#action_12630434 ]
Toby Segaran commented on LUCENE-1039: -------------------------------------- I'm the author of "Programming Collective Intelligence". I see no issue with property rights, the algorithm itself is widely known and my book just explains it. The code Karl wrote is completely original. > Bayesian classifiers using Lucene as data store > ----------------------------------------------- > > Key: LUCENE-1039 > URL: https://issues.apache.org/jira/browse/LUCENE-1039 > Project: Lucene - Java > Issue Type: New Feature > Reporter: Karl Wettin > Assignee: Karl Wettin > Priority: Minor > Attachments: LUCENE-1039.txt > > > Bayesian classifiers using Lucene as data store. Based on the Naive Bayes and > Fisher method algorithms as described by Toby Segaran in "Programming > Collective Intelligence", ISBN 978-0-596-52932-1. > Have fun. > Poor java docs, but the TestCase shows how to use it: > {code:java} > public class TestClassifier extends TestCase { > public void test() throws Exception { > InstanceFactory instanceFactory = new InstanceFactory() { > public Document factory(String text, String _class) { > Document doc = new Document(); > doc.add(new Field("class", _class, Field.Store.YES, > Field.Index.NO_NORMS)); > doc.add(new Field("text", text, Field.Store.YES, Field.Index.NO, > Field.TermVector.NO)); > doc.add(new Field("text/ngrams/start", text, Field.Store.NO, > Field.Index.TOKENIZED, Field.TermVector.YES)); > doc.add(new Field("text/ngrams/inner", text, Field.Store.NO, > Field.Index.TOKENIZED, Field.TermVector.YES)); > doc.add(new Field("text/ngrams/end", text, Field.Store.NO, > Field.Index.TOKENIZED, Field.TermVector.YES)); > return doc; > } > Analyzer analyzer = new Analyzer() { > private int minGram = 2; > private int maxGram = 3; > public TokenStream tokenStream(String fieldName, Reader reader) { > TokenStream ts = new StandardTokenizer(reader); > ts = new LowerCaseFilter(ts); > if (fieldName.endsWith("/ngrams/start")) { > ts = new EdgeNGramTokenFilter(ts, > EdgeNGramTokenFilter.Side.FRONT, minGram, maxGram); > } else if (fieldName.endsWith("/ngrams/inner")) { > ts = new NGramTokenFilter(ts, minGram, maxGram); > } else if (fieldName.endsWith("/ngrams/end")) { > ts = new EdgeNGramTokenFilter(ts, EdgeNGramTokenFilter.Side.BACK, > minGram, maxGram); > } > return ts; > } > }; > public Analyzer getAnalyzer() { > return analyzer; > } > }; > Directory dir = new RAMDirectory(); > new IndexWriter(dir, null, true).close(); > Instances instances = new Instances(dir, instanceFactory, "class"); > instances.addInstance("hello world", "en"); > instances.addInstance("hallå världen", "sv"); > instances.addInstance("this is london calling", "en"); > instances.addInstance("detta är london som ringer", "sv"); > instances.addInstance("john has a long mustache", "en"); > instances.addInstance("john har en lång mustache", "sv"); > instances.addInstance("all work and no play makes jack a dull boy", "en"); > instances.addInstance("att bara arbeta och aldrig leka gör jack en trist > gosse", "sv"); > instances.addInstance("shrimp sandwich", "en"); > instances.addInstance("räksmörgås", "sv"); > instances.addInstance("it's now or never", "en"); > instances.addInstance("det är nu eller aldrig", "sv"); > instances.addInstance("to tie up at a landing-stage", "en"); > instances.addInstance("att angöra en brygga", "sv"); > instances.addInstance("it's now time for the children's television > shows", "en"); > instances.addInstance("nu är det dags för barnprogram", "sv"); > instances.flush(); > testClassifier(instances, new NaiveBayesClassifier()); > testClassifier(instances, new FishersMethodClassifier()); > instances.close(); > } > private void testClassifier(Instances instances, BayesianClassifier > classifier) throws IOException { > assertEquals("sv", classifier.classify(instances, "detta blir ett > test")[0].getClassification()); > assertEquals("en", classifier.classify(instances, "this will be a > test")[0].getClassification()); > // test training data instances. all ought to match! > for (int documentNumber = 0; documentNumber < > instances.getIndexReader().maxDoc(); documentNumber++) { > if (!instances.getIndexReader().isDeleted(documentNumber)) { > Map<Term, Double> features = > instances.extractFeatures(instances.getIndexReader(), documentNumber, > classifier.isNormalized()); > Document document = > instances.getIndexReader().document(documentNumber); > assertEquals(document.get("class"), classifier.classify(instances, > features)[0].getClassification()); > } > } > } > {code} -- This message is automatically generated by JIRA. - You can reply to this email to add a comment to the issue online. --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]