I found a bug in the LangDetect implementation of language detection, where the
maxTotalChars property isn't doing what its description says it does: Solr only
uses the LangDetect library's append() method, which checks the length of each
string being appended rather than the total length of the text accumulated so
far [1].
I've got a patch (attached) that solves this issue by hoisting a few of the
utility methods out of the Tika implementation into the parent class, so that
the LangDetect implementation can reuse them. However, I stumbled upon
SOLR-3881 [2], where those methods (concatFields and getExpectedSize
specifically) were taken out of the parent class for reasons that are not clear
to me from the comments.
Could I get some historical context on the issue and feedback on my patch?
Thanks
[1]
https://github.com/shuyo/language-detection/blob/master/src/com/cybozu/labs/langdetect/Detector.java#L170
[2] https://issues.apache.org/jira/browse/SOLR-3881
diff --git
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
index b1c3cba..f100ca8 100644
---
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
+++
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
@@ -47,39 +47,21 @@ public class LangDetectLanguageIdentifierUpdateProcessor
extends LanguageIdentif
@Override
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
try {
- Detector detector = DetectorFactory.create();
- detector.setMaxTextLength(maxTotalChars);
-
- for (String fieldName : inputFields) {
- log.debug("Appending field " + fieldName);
- if (doc.containsKey(fieldName)) {
- Collection<Object> fieldValues = doc.getFieldValues(fieldName);
- if (fieldValues != null) {
- for (Object content : fieldValues) {
- if (content instanceof String) {
- String stringContent = (String) content;
- if (stringContent.length() > maxFieldValueChars) {
- detector.append(stringContent.substring(0, maxFieldValueChars));
- } else {
- detector.append(stringContent);
- }
- detector.append(" ");
- } else {
- log.warn("Field " + fieldName + " not a String value, not including in detection");
- }
- }
- }
+ String content = concatFields(doc);
+ if (content.length() != 0) {
+ Detector detector = DetectorFactory.create();
+ detector.append(content);
+ ArrayList<Language> langlist = detector.getProbabilities();
+ ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
+ for (Language l: langlist) {
+ solrLangList.add(new DetectedLanguage(l.lang, l.prob));
}
+ return solrLangList;
}
- ArrayList<Language> langlist = detector.getProbabilities();
- ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
- for (Language l: langlist) {
- solrLangList.add(new DetectedLanguage(l.lang, l.prob));
- }
- return solrLangList;
} catch (LangDetectException e) {
log.debug("Could not determine language, returning empty list: ", e);
return Collections.emptyList();
}
+ return Collections.emptyList();
}
}
diff --git
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
index 6b85c9b..05039cc 100644
---
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
+++
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
@@ -276,6 +276,58 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
}
/**
+ * Calculate expected string size of a Solr document with a list of fields to use
+ * @param doc solr input document
+ * @return expected size of string value
+ */
+ private int getExpectedSize(SolrInputDocument doc) {
+ int docSize = 0;
+ for (String field : inputFields) {
+ Collection<Object> contents = doc.getFieldValues(field);
+ for (Object content : contents) {
+ if (content instanceof String) {
+ docSize += Math.min(((String) content).length(), maxFieldValueChars);
+ }
+ }
+ docSize = Math.min(docSize, maxTotalChars);
+ }
+ return docSize;
+ }
+
+ /**
+ * Concatenates content from multiple fields, respecting maximum field length setting
+ */
+ protected String concatFields(SolrInputDocument doc) {
+ StringBuilder sb = new StringBuilder(getExpectedSize(doc));
+ for (String fieldName : inputFields) {
+ log.debug("Appending field " + fieldName);
+ if (doc.containsKey(fieldName)) {
+ Collection<Object> fieldValues = doc.getFieldValues(fieldName);
+ if (fieldValues != null) {
+ for (Object content : fieldValues) {
+ if (content instanceof String) {
+ String stringContent = (String) content;
+ if (stringContent.length() > maxFieldValueChars) {
+ sb.append(stringContent.substring(0, maxFieldValueChars));
+ } else {
+ sb.append(stringContent);
+ }
+ sb.append(" ");
+ if (sb.length() > maxTotalChars) {
+ sb.setLength(maxTotalChars);
+ break;
+ }
+ } else {
+ log.warn("Field " + fieldName + " not a String value, not including in detection");
+ }
+ }
+ }
+ }
+ }
+ return sb.toString();
+ }
+
+ /**
* Decides the fallback language, either from content of fallback field or
fallback value
* @param doc the Solr document
* @param fallbackFields an array of strings with field names containing
fallback language codes
diff --git
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
index 5d12174..229e08d 100644
---
a/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
+++
b/solr/contrib/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
@@ -61,59 +61,4 @@ public class TikaLanguageIdentifierUpdateProcessor extends
LanguageIdentifierUpd
}
return languages;
}
-
-
- /**
- * Concatenates content from multiple fields
- */
- protected String concatFields(SolrInputDocument doc) {
- StringBuilder sb = new StringBuilder(getExpectedSize(doc, inputFields));
- for (String fieldName : inputFields) {
- log.debug("Appending field " + fieldName);
- if (doc.containsKey(fieldName)) {
- Collection<Object> fieldValues = doc.getFieldValues(fieldName);
- if (fieldValues != null) {
- for (Object content : fieldValues) {
- if (content instanceof String) {
- String stringContent = (String) content;
- if (stringContent.length() > maxFieldValueChars) {
- sb.append(stringContent.substring(0, maxFieldValueChars));
- } else {
- sb.append(stringContent);
-}
- sb.append(" ");
- if (sb.length() > maxTotalChars) {
- sb.setLength(maxTotalChars);
- break;
- }
- } else {
- log.warn("Field " + fieldName + " not a String value, not including in detection");
- }
- }
- }
- }
- }
- return sb.toString();
- }
-
- /**
- * Calculate expected string size.
- *
- * @param doc solr input document
- * @param fields fields to select
- * @return expected size of string value
- */
- private int getExpectedSize(SolrInputDocument doc, String[] fields) {
- int docSize = 0;
- for (String field : fields) {
- Collection<Object> contents = doc.getFieldValues(field);
- for (Object content : contents) {
- if (content instanceof String) {
- docSize += Math.min(((String) content).length(), maxFieldValueChars);
- }
- }
- docSize = Math.min(docSize, maxTotalChars);
- }
- return docSize;
- }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]