Re: Sort difference between 2.1 and 2.3
You're right, Lucene changed wrt the 0xffff character: 2.3 now uses this character internally as an end of term marker when storing term text. This was done as part of LUCENE-843 (speeding up indexing). Technically that character is an invalid UTF16 character (for interchange), but it looks like a few Lucene users were indeed relying on older Lucene versions accepting and preserving it. You could use 0xfffe instead? Lucene 2.3 will preserve it, though it's also invalid for interchange (so future Lucene versions might change wrt that, too). Or ... it looks like your use case is to sort all last values after all first values? In which case one way to do this (without using invalid UTF16 characters) might be to add a new field marking whether you have a last or a first value, then sort first by that field and second by your value field? Mike Antony Bowesman [EMAIL PROTECTED] wrote: Hi, I had a test case that added two documents, each with one untokenized field, and sorted them. The data in the two documents was char(1) + "First" and char(0xffff) + "Last". With Lucene 2.1 the documents are sorted correctly, but with Lucene 2.3.1, they are not. Looking at the index with Luke shows that the document with Last has not been handled correctly, i.e. the text for the subject field is empty. The test case below shows the problem. 
Regards Antony import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.junit.After; import org.junit.Before; import org.junit.Test; public class LastSubjectTest { /** * Set up a number of documents with 1 duplicate ContentId * @throws Exception */ @Before public void setUp() throws Exception { IndexWriter writer = new IndexWriter(TestDir/, new StandardAnalyzer(), true); Document doc = new Document(); String subject = new StringBuffer(1).append((char)0x).toString() + Last; Field f = new Field(subject, subject, Field.Store.YES, Field.Index.NO_NORMS); doc.add(f); writer.addDocument(doc); doc = new Document(); subject = new StringBuffer(1).append((char)0x1).toString() + First; f = new Field(subject, subject, Field.Store.YES, Field.Index.NO_NORMS); doc.add(f); writer.addDocument(doc); writer.close(); } /** * @throws Exception */ @After public void tearDown() throws Exception { } /** * Tests that the last is after first document, sorted by subject * @throws IOException */ @Test public void testSortDateAscending() throws IOException { IndexSearcher searcher = new IndexSearcher(TestDir/); Query q = new MatchAllDocsQuery(); Sort sort = new Sort(new SortField(subject)); Hits hits = searcher.search(q, sort); assertEquals(Hits should match all documents, searcher.getIndexReader().maxDoc(), hits.length()); Document fd = hits.doc(0); Document ld = hits.doc(1); String fs = fd.get(subject); String ls = ld.get(subject); for (int i = 0; i hits.length(); i++) { Document doc = 
hits.doc(i); String subject = doc.get(subject); System.out.println(Subject: + subject); } assertTrue(Subjects have been sorted incorrectly, fs.compareTo(ls) 0); } } - To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED] - To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
Re: Sort difference between 2.1 and 2.3
Thanks for the explanation Mike. It's not a big issue, it's just a test case where I needed to ensure ordering for the test, so I'll just use a valid high UTF-16 character. It just seemed odd that the field was showing strangely in Luke. Your explanation gives the reason, thanks. Antony Michael McCandless wrote: You're right, Lucene changed wrt the 0xffff character: 2.3 now uses this character internally as an end of term marker when storing term text. This was done as part of LUCENE-843 (speeding up indexing). Technically that character is an invalid UTF16 character (for interchange), but it looks like a few Lucene users were indeed relying on older Lucene versions accepting and preserving it. You could use 0xfffe instead? Lucene 2.3 will preserve it, though it's also invalid for interchange (so future Lucene versions might change wrt that, too). Or ... it looks like your use case is to sort all last values after all first values? In which case one way to do this (without using invalid UTF16 characters) might be to add a new field marking whether you have a last or a first value, then sort first by that field and second by your value field? Mike Antony Bowesman [EMAIL PROTECTED] wrote: Hi, I had a test case that added two documents, each with one untokenized field, and sorted them. The data in the two documents was char(1) + "First" and char(0xffff) + "Last". With Lucene 2.1 the documents are sorted correctly, but with Lucene 2.3.1, they are not. Looking at the index with Luke shows that the document with Last has not been handled correctly, i.e. the text for the subject field is empty. The test case below shows the problem. 
Regards Antony import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.junit.After; import org.junit.Before; import org.junit.Test; public class LastSubjectTest { /** * Set up a number of documents with 1 duplicate ContentId * @throws Exception */ @Before public void setUp() throws Exception { IndexWriter writer = new IndexWriter(TestDir/, new StandardAnalyzer(), true); Document doc = new Document(); String subject = new StringBuffer(1).append((char)0x).toString() + Last; Field f = new Field(subject, subject, Field.Store.YES, Field.Index.NO_NORMS); doc.add(f); writer.addDocument(doc); doc = new Document(); subject = new StringBuffer(1).append((char)0x1).toString() + First; f = new Field(subject, subject, Field.Store.YES, Field.Index.NO_NORMS); doc.add(f); writer.addDocument(doc); writer.close(); } /** * @throws Exception */ @After public void tearDown() throws Exception { } /** * Tests that the last is after first document, sorted by subject * @throws IOException */ @Test public void testSortDateAscending() throws IOException { IndexSearcher searcher = new IndexSearcher(TestDir/); Query q = new MatchAllDocsQuery(); Sort sort = new Sort(new SortField(subject)); Hits hits = searcher.search(q, sort); assertEquals(Hits should match all documents, searcher.getIndexReader().maxDoc(), hits.length()); Document fd = hits.doc(0); Document ld = hits.doc(1); String fs = fd.get(subject); String ls = ld.get(subject); for (int i = 0; i hits.length(); i++) { Document doc = 
hits.doc(i); String subject = doc.get(subject); System.out.println(Subject: + subject); } assertTrue(Subjects have been sorted incorrectly, fs.compareTo(ls) 0); } } - To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED] - To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED] - To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
Sort difference between 2.1 and 2.3
Hi, I had a test case that added two documents, each with one untokenized field, and sorted them. The data in the two documents was char(1) + "First" and char(0xffff) + "Last". With Lucene 2.1 the documents are sorted correctly, but with Lucene 2.3.1, they are not. Looking at the index with Luke shows that the document with Last has not been handled correctly, i.e. the text for the subject field is empty. The test case below shows the problem. Regards Antony import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.junit.After; import org.junit.Before; import org.junit.Test; public class LastSubjectTest { /** * Set up a number of documents with 1 duplicate ContentId * @throws Exception */ @Before public void setUp() throws Exception { IndexWriter writer = new IndexWriter(TestDir/, new StandardAnalyzer(), true); Document doc = new Document(); String subject = new StringBuffer(1).append((char)0x).toString() + Last; Field f = new Field(subject, subject, Field.Store.YES, Field.Index.NO_NORMS); doc.add(f); writer.addDocument(doc); doc = new Document(); subject = new StringBuffer(1).append((char)0x1).toString() + First; f = new Field(subject, subject, Field.Store.YES, Field.Index.NO_NORMS); doc.add(f); writer.addDocument(doc); writer.close(); } /** * @throws Exception */ @After public void tearDown() throws Exception { } /** * Tests that the last is after first document, sorted by subject * @throws IOException */ @Test public void testSortDateAscending() throws IOException { 
IndexSearcher searcher = new IndexSearcher(TestDir/); Query q = new MatchAllDocsQuery(); Sort sort = new Sort(new SortField(subject)); Hits hits = searcher.search(q, sort); assertEquals(Hits should match all documents, searcher.getIndexReader().maxDoc(), hits.length()); Document fd = hits.doc(0); Document ld = hits.doc(1); String fs = fd.get(subject); String ls = ld.get(subject); for (int i = 0; i hits.length(); i++) { Document doc = hits.doc(i); String subject = doc.get(subject); System.out.println(Subject: + subject); } assertTrue(Subjects have been sorted incorrectly, fs.compareTo(ls) 0); } } - To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]