Author: ogrisel
Date: Mon May 7 12:43:41 2012
New Revision: 1334978
URL: http://svn.apache.org/viewvc?rev=1334978&view=rev
Log:
STANBOL-197: started documentation for sample topic models training
Added:
incubator/stanbol/trunk/enhancer/topic-web/tools/README.md
incubator/stanbol/trunk/enhancer/topic-web/tools/dbpediacategories.py
Modified:
incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py
Added: incubator/stanbol/trunk/enhancer/topic-web/tools/README.md
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/topic-web/tools/README.md?rev=1334978&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/topic-web/tools/README.md (added)
+++ incubator/stanbol/trunk/enhancer/topic-web/tools/README.md Mon May 7 12:43:41 2012
@@ -0,0 +1,56 @@
+# Helper scripts to build training sets and classifier models
+
+Before using any of the following scripts you should configure a new
+classifier model identified as `model` along with a matching training
+set, for instance using the Felix System Console at
+http://localhost:8080/system/console. The HTTP API for that classifier
+model will be published at:
+
+    http://localhost:8080/topic/model
+
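+To check that the endpoint is up before importing any data, you can
+issue a simple request from Python (a minimal sketch, assuming the
+endpoint answers plain GET requests):
+
+    import urllib2
+
+    # any non-error HTTP status means the model endpoint is reachable
+    response = urllib2.urlopen("http://localhost:8080/topic/model")
+    print(response.getcode())
+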
+
+## Using NewsML documents with IPTC subjects annotation
+
+NewsML is a standard XML file format used by major news agencies. The
+topics of news articles are categorized using a controlled vocabulary.
+
+Such a vocabulary can be loaded into the Entityhub by copying the IPTC
+[zip archive][1] into the `stanbol/datafiles` folder of a running server
+and deploying the [referenced site definition jar][2] (for instance using
+the Felix Console).
+
+[1] http://dev.iks-project.eu/downloads/stanbol-indices/iptc.solrindex.zip
+[2] http://dev.iks-project.eu/downloads/stanbol-indices/org.apache.stanbol.data.site.iptc-1.0.0.jar
+
+If you have an archive of NewsML files at hand you can train a topic
+classifier on it by using the files to build the training set for the
+model (you need Python 2.7 and lxml to run the script).
+
+First import the RDF definition of the IPTC taxonomy into the model:
+
+ TODO
+
+Then import the data into the training set of the model:
+
+    python newsmlimporter.py /path/to/newsml/toplevel/folder 10000 \
+ http://localhost:8080/topic/model/trainingset
+
+The second argument is the maximum number of news items to import into
+the training set.
+
+You can then train the model with curl:
+
+    curl -i -X POST "http://localhost:8080/topic/model/trainer?incremental=false"
+
+The model can then be used as part of any enhancer engine chain to assign
+IPTC topics to text documents.
+
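+As a quick check, you can then post a plain text document to the
+enhancer (a minimal sketch, assuming the default stateless enhancer
+endpoint at `/enhancer` and a chain that includes the topic
+classification engine):
+
+    import urllib2
+
+    # POST raw text and print the returned enhancements, which should
+    # include the assigned IPTC topics
+    request = urllib2.Request("http://localhost:8080/enhancer",
+                              data="Some news article text to classify.")
+    request.add_header('Content-Type', 'text/plain')
+    print(urllib2.urlopen(request).read())
+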
+
+## Using DBpedia categories
+
+A subset of the Wikipedia / DBpedia categories can be used as the target
+topics of a classifier. To extract such a taxonomy of topics you can use
+[dbpediakit][3]:
+
+[3] https://github.com/ogrisel/dbpediakit
+
+ python dbpediacategories.py topics.tsv examples.tsv \
+ http://localhost:8080/topic/model
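+
+For reference, `dbpediacategories.py` expects tab-separated files with
+the following columns (schematic rows, as parsed by the script; the
+actual files are produced by dbpediakit):
+
+    # topics.tsv: one category per row
+    concept<TAB>broader concepts (space separated, or \N for none)<TAB>primary topic
+
+    # examples.tsv: one article per row (may be bzip2 compressed)
+    example id<TAB>categories (space separated)<TAB>article text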
Added: incubator/stanbol/trunk/enhancer/topic-web/tools/dbpediacategories.py
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/topic-web/tools/dbpediacategories.py?rev=1334978&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/topic-web/tools/dbpediacategories.py (added)
+++ incubator/stanbol/trunk/enhancer/topic-web/tools/dbpediacategories.py Mon May 7 12:43:41 2012
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""""Build a classifier using a subset of the DBpedia categories"""
+from __future__ import print_function
+
+from bz2 import BZ2File
+from time import time
+import urllib2
+from urllib import quote
+
+DBPEDIA_URL_PREFIX = "http://dbpedia.org/resource/"
+
+
+def load_topics_from_tsv(filename, server_url):
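+    """Register each category of the taxonomy as a concept of the model.
+
+    Each TSV row holds a concept, its broader concepts and its primary
+    topic; they are POSTed one by one to the model's /concept endpoint.
+    """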
+ lines = open(filename, 'rb').readlines()
+
+ count = 0
+ previous = time()
+
+ for line in lines:
+ concept, broader_concepts, primary_topic = line.split('\t')
+ primary_topic = DBPEDIA_URL_PREFIX + primary_topic.strip()
+ concept = DBPEDIA_URL_PREFIX + concept.strip()
+ if broader_concepts == '\\N':
+ # postgresql marker for NULL values in TSV files
+ broader_concepts = []
+ else:
+ broader_concepts = [DBPEDIA_URL_PREFIX + b.strip()
+ for b in broader_concepts.split()]
+
+        # URL-escape the values: they are full dbpedia.org URIs
+        url = server_url + "?id=%s&primaryTopic=%s" % (
+            quote(concept), quote(primary_topic))
+
+ for broader_concept in broader_concepts:
+ url += "&broader=%s" % quote(broader_concept)
+
+ # force POST verb with data keyword
+ request = urllib2.Request(url, data="")
+ opener = urllib2.build_opener()
+ opener.open(request).read()
+
+ count += 1
+ if count % 1000 == 0:
+ delta, previous = time() - previous, time()
+ print("Imported concepts %03d/%03d in %06.3fs"
+ % (count, len(lines), delta))
+
+
+def load_examples_from_tsv(filename, server_url):
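+    """Register each article as a training example for its categories.
+
+    The article text is POSTed as 'text/plain' to the model's
+    /trainingset endpoint, tagged with its dbpedia categories.
+    """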
+ if filename.endswith('.bz2'):
+ lines = BZ2File(filename).readlines()
+ else:
+ lines = open(filename, 'rb').readlines()
+
+ count = 0
+ previous = time()
+
+ for line in lines:
+        # split on the first two tabs only: the article text may contain tabs
+        example_id, categories, text = line.split('\t', 2)
+ example_id = DBPEDIA_URL_PREFIX + example_id
+ categories = [DBPEDIA_URL_PREFIX + c for c in categories.split()]
+
+        url = server_url + "?example_id=%s" % quote(example_id)
+ for category in categories:
+ url += "&concept=%s" % quote(category)
+ request = urllib2.Request(url, data=text)
+ request.add_header('Content-Type', 'text/plain')
+ opener = urllib2.build_opener()
+ opener.open(request).read()
+
+ count += 1
+ if count % 1000 == 0:
+ delta, previous = time() - previous, time()
+ print("Processed articles %03d/%03d in %06.3fs"
+ % (count, len(lines), delta))
+
+
+if __name__ == "__main__":
+ import sys
+ topics_filename = sys.argv[1]
+ examples_filename = sys.argv[2]
+ topic_model_url = sys.argv[3]
+
+ print("Loading taxonomy definition from:", topics_filename)
+ t0 = time()
+ load_topics_from_tsv(topics_filename,
+ topic_model_url + '/concept')
+ print("Taxonomy loaded in %0.3fs." % (time() - t0))
+
+ print("Loading training set from:", examples_filename)
+ t0 = time()
+ load_examples_from_tsv(examples_filename,
+ topic_model_url + '/trainingset')
+ print("Dataset loaded in %0.3fs." % (time() - t0))
+
+ print("Training model from dataset...")
+ # Force usage of the POST HTTP verb:
+ t0 = time()
+ request = urllib2.Request(topic_model_url + '/trainer', data="")
+    urllib2.build_opener().open(request).read()
+ print("Model updated in %0.3fs." % (time() - t0))
Modified: incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py?rev=1334978&r1=1334977&r2=1334978&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py (original)
+++ incubator/stanbol/trunk/enhancer/topic-web/tools/newsmlimporter.py Mon May 7 12:43:41 2012
@@ -58,7 +58,7 @@ def find_text_and_subjects(newsml_conten
def register_newsml_document(text, codes, url):
- id = sha1(text).hexdigest()
+ id = sha1(text.encode('utf-8')).hexdigest()
url += "?example_id=%s" % id
for code in codes:
url += "&concept=%s" % quote(code)