I would like to strip HTML tags before indexing. Here is a simple example I 
tried so far, but it doesn't seem to strip the HTML tags. Any ideas what's missing?

//settings & Mappings
POST twitter
{
  "mappings": {
    "tweet": {
      "properties": {
        "message": {
          "type": "string",
          "analyzer": "strip_html_analyzer"
        },
        "date": {
          "type": "date"
        },
        "name": {
          "type": "string"
        }
      }
    }
  },
  "settings": {
    "analysis": {
      "analyzer": {
        "strip_html_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["standard"],
          "char_filter": ["my_html"]
        }
      },
      "char_filter": {
        "my_html": {
          "type": "html_strip"
        }
      }
    }
  }
}


//Index a document
PUT /twitter/tweet/1
{
  "name": "mike",
  "date": "2009-11-15T14:12:12",
  "message": "<html>trying out <b>Elasticsearch</b>, This is an html test</html>"
}


//query result for "html". I expected the query to return nothing, since the
//analyzer is supposed to strip the tags?
"hits": {
  "total": 1,
  "max_score": 0.11626227,
  "hits": [
    {
      "_index": "twitter",
      "_type": "tweet",
      "_id": "1",
      "_score": 0.11626227,
      "fields": {
        "message": [
          "<html>trying out <b>Elasticsearch</b>, This is an html test</html>"
        ]
      },
      "highlight": {
        "message": [
          "<html>trying out <b>Elasticsearch</b>, This is an <em>html</em> test</html>"
        ]
      }
    }
  ]
}




-- 
You received this message because you are subscribed to the Google Groups 
"elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/elasticsearch/517fe8b8-0b38-4646-bc8f-a27896454515%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to