Revision: 58746 Author: jhoffman Date: 2017-03-02 12:16:37 +0100 (Thu, 02 Mar 2017) Log Message: ----------- Moved project from forge.onehippo.org/svn/ without history
Added Paths: ----------- attic/forge/wikipediaimport/ attic/forge/wikipediaimport/branches/ attic/forge/wikipediaimport/tags/ attic/forge/wikipediaimport/trunk/ attic/forge/wikipediaimport/trunk/README.TXT attic/forge/wikipediaimport/trunk/filter.json attic/forge/wikipediaimport/trunk/find_cycles.php attic/forge/wikipediaimport/trunk/get_category_tree.php attic/forge/wikipediaimport/trunk/hash_importer.php attic/forge/wikipediaimport/trunk/importer.php attic/forge/wikipediaimport/trunk/library/ attic/forge/wikipediaimport/trunk/library/jcr/ attic/forge/wikipediaimport/trunk/library/jcr/node.php attic/forge/wikipediaimport/trunk/library/jcr/parser.php attic/forge/wikipediaimport/trunk/library/jcr/property.php attic/forge/wikipediaimport/trunk/library/jcr/value.php attic/forge/wikipediaimport/trunk/library/wiki/ attic/forge/wikipediaimport/trunk/library/wiki/parseRaw.inc.php attic/forge/wikipediaimport/trunk/library/wiki/table-converter.inc.php attic/forge/wikipediaimport/trunk/list_top_categories.php attic/forge/wikipediaimport/trunk/listcategories.php attic/forge/wikipediaimport/trunk/resources/ attic/forge/wikipediaimport/trunk/resources/pom.xml attic/forge/wikipediaimport/trunk/resources/wikipedia-data.xml attic/forge/wikipediaimport/trunk/resources/wikipedia-namespace.xml attic/forge/wikipediaimport/trunk/resources/wikipedia.cnd attic/forge/wikipediaimport/trunk/run_tests.php attic/forge/wikipediaimport/trunk/source/ attic/forge/wikipediaimport/trunk/source/Category.php attic/forge/wikipediaimport/trunk/source/CategoryCycleDetector.php attic/forge/wikipediaimport/trunk/source/CategoryFilter.php attic/forge/wikipediaimport/trunk/source/CategoryHandler.php attic/forge/wikipediaimport/trunk/source/HippoExtensionWriter.php attic/forge/wikipediaimport/trunk/source/IWikiHandler.php attic/forge/wikipediaimport/trunk/source/JcrDocument.php attic/forge/wikipediaimport/trunk/source/JcrDocumentWriter.php attic/forge/wikipediaimport/trunk/source/JcrHashFolder.php 
attic/forge/wikipediaimport/trunk/source/JcrHashFolderWriter.php attic/forge/wikipediaimport/trunk/source/JcrWikiFolder.php attic/forge/wikipediaimport/trunk/source/MavenProject.php attic/forge/wikipediaimport/trunk/source/OrderedList.php attic/forge/wikipediaimport/trunk/source/TestCase.php attic/forge/wikipediaimport/trunk/source/WikiDocument.php attic/forge/wikipediaimport/trunk/source/WikiParser.php attic/forge/wikipediaimport/trunk/source/WriterTestCase.php attic/forge/wikipediaimport/trunk/source/main/ attic/forge/wikipediaimport/trunk/source/main/resources/ attic/forge/wikipediaimport/trunk/test/ attic/forge/wikipediaimport/trunk/test/CategoryCycleDetectorTest.php attic/forge/wikipediaimport/trunk/test/CategoryFilterTest.php attic/forge/wikipediaimport/trunk/test/CategoryTest.php attic/forge/wikipediaimport/trunk/test/HippoExtensionWriterTest.php attic/forge/wikipediaimport/trunk/test/JcrDocumentWriterTest.php attic/forge/wikipediaimport/trunk/test/JcrHashFolderWriterTest.php attic/forge/wikipediaimport/trunk/test/JcrXmlParserTest.php attic/forge/wikipediaimport/trunk/test/MavenProjectTest.php attic/forge/wikipediaimport/trunk/test/OrderedListTest.php attic/forge/wikipediaimport/trunk/test/TestCaseTest.php attic/forge/wikipediaimport/trunk/test/WikiDocumentTest.php attic/forge/wikipediaimport/trunk/test/WriterTestCaseTest.php Added: attic/forge/wikipediaimport/trunk/README.TXT =================================================================== --- attic/forge/wikipediaimport/trunk/README.TXT (rev 0) +++ attic/forge/wikipediaimport/trunk/README.TXT 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,69 @@ +Wikipedia content import +--------------------------------- + +Contents +1. Using the pre-generated jar's + 1.1 Placing the jar in your local maven repository + 1.2 Add an dependency in your applications pom file +2. Generating your own jar's +3. Known issues + + +1. 
Using the pre-generated jar's +================================ +When using the pre-generated jars, there are just a few steps you need to take. +- You need to place the jar in your local maven repository +- You need to add a dependency in your application's pom file + +=== 1.1 Placing the jar in your local maven repository === +Copy the following command to your command-line and let maven do the work for you. +Be sure to modify the -Dfile option to point to the desired jar. + +mvn install:install-file \ + -Dfile=wikipedia-content-1.0.0-100.jar \ + -DgroupId=org.onehippo.addon.content.wikipedia \ + -DartifactId=wikipedia-content \ + -Dversion=1.0.0 \ + -Dpackaging=jar \ + -DgeneratePom=true + +Alternatively you could copy the jar yourself to the correct location in your maven +repository if you know the layout. + +=== 1.2 Add an dependency in your applications pom file === +Open your application's pom file in your favorite text editor and add the following to +your <dependencies> section. + +<!-- wikipedia content addon --> +<dependency> + <groupId>org.onehippo.addon.content.wikipedia</groupId> + <artifactId>wikipedia-content</artifactId> + <version>1.0.0</version> + <type>jar</type> +</dependency> + +That's it. You're done. Please note that the bigger data sets can take a very long time to import. +They may also require more memory to be assigned to the JVM. + + +2. Generating your own jar's +============================= +First we need to generate some content. For this you will need an export from the wikipedia +content. You can download it here: http://en.wikipedia.org/wiki/Wikipedia_database +The 'pages-current.xml.bz2 - Current revisions only, all pages' will do just fine. Extract it. +We will need the *pages-articles.xml file. + +php importer.php <path-to-wikipedia-content-xml> <number-of-articles> + +This will generate a maven project in the target/ folder. In this folder, run + +mvn clean install + +This will package your jar and place it in your repository. 
Now add your jar to your application +like explained in section 1.2 and see the results. + + +3. Known issues +============================== +- not all wiki syntax is converted to html +- category hierarchy is constructed on any [[Category:...]] link, not just in the parents section Property changes on: attic/forge/wikipediaimport/trunk/README.TXT ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +Id \ No newline at end of property Added: attic/forge/wikipediaimport/trunk/filter.json =================================================================== --- attic/forge/wikipediaimport/trunk/filter.json (rev 0) +++ attic/forge/wikipediaimport/trunk/filter.json 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1 @@ +["Data analysis","Concurrency control","Data management","Computer file systems","Computer storage media","Video storage","Search algorithms","Searching","Library cataloging and classification","Metadata","Information retrieval","Audio storage","Electronic documents","Ontology (computer science)","Semantic Web","Databases","Digital libraries","Film sound production","Sound production technology","Office software","Sound recording","Recording","Internet search engines","Internet search","Data search engines","Data security","Ontology languages","Query languages","Metadata registry","MusicBrainz","Voice technology","Mass digitization","Content management systems","Geographic region-oriented digital libraries","Search engine software","EPrint archives","Open access archives","Discipline-oriented digital libraries","Transaction processing","String similarity measures","Commercial digital libraries","World Digital Library","Document-oriented databases","Corpora","Language-oriented digital libraries","Aggregation-based digital libraries","Data warehousing","Web archives","Digital library projects","Data partitioning","Computer-aided software 
<?php
/**
 * Prints the cycles found in the stored category graph as a JSON array,
 * one cycle per line.
 *
 * Uses the full "<?php" open tag: the short "<?" tag is only honoured when
 * short_open_tag is enabled, otherwise the script is echoed as plain text.
 */

// require_once: without these classes nothing below can run, so a missing
// file must stop the script instead of producing a warning and a fatal
// "class not found" further down.
require_once dirname(__FILE__) . '/source/Category.php';
require_once dirname(__FILE__) . '/source/CategoryCycleDetector.php';

Category::restore();

$detector = new CategoryCycleDetector();
$cycles = $detector->getCycles(Category::find());

// Encode each cycle on its own line; entries are separated by ",\n".
$encoded = array();
foreach ($cycles as $cycle) {
    $encoded[] = " " . json_encode($cycle);
}

echo "[\n";
echo implode(",\n", $encoded);
echo "]\n";
<?php
/**
 * Imports a Wikipedia dump through a CategoryFilter into a JcrHashFolderWriter,
 * after creating a Maven project skeleton in the "target" folder.
 *
 * Usage: php hash_importer.php <path-to-wikipedia-dump-xml>
 *
 * The category names to keep are read from filter.json in the working directory.
 */

// The dump path is mandatory; without this check a missing argument produced
// an "undefined offset" notice and the confusing message "file  does not exist".
if (!isset($argv[1])) {
    die("Usage: php hash_importer.php <path-to-wikipedia-dump-xml>\n");
}
if (!file_exists($argv[1])) {
    die("Wikipedia dump file $argv[1] does not exist.");
}
if (!file_exists("filter.json")) {
    die("Filter file filter.json does not exist");
}

// Validate the filter before any output folder is created.
$filter = json_decode(file_get_contents("filter.json"));
if (!is_array($filter)) {
    die("Filter file filter.json does not contain a JSON array");
}

include_once "source/MavenProject.php";

// As before, an existing target folder aborts the run (so earlier output is
// never mixed with new output), but now with an accurate message and without
// the PHP warning mkdir() raises for an existing directory.
if (file_exists("target")) {
    die("Folder target already exists; remove it before running the import");
}
if (!mkdir("target")) {
    die("Could not create folder target");
}
define("TARGET", "target");

$project = new MavenProject("target");
$project->create();

include_once "source/WikiParser.php";
include_once "source/CategoryFilter.php";
include_once "source/JcrHashFolderWriter.php";

$parser = new WikiParser(new CategoryFilter(new JcrHashFolderWriter(), $filter));
$parser->setFile($argv[1]);
$parser->setLimit(0);
$parser->parse();

/* EOF */
<?php
/**
 * Imports articles from a Wikipedia dump through a JcrDocumentWriter, after
 * creating a Maven project skeleton in the "target" folder.
 *
 * Usage: php importer.php <path-to-wikipedia-content-xml> [<number-of-articles>]
 *
 * The article limit defaults to 1000 and is only overridden by a positive integer.
 */

// The dump path is mandatory; without this check a missing argument produced
// an "undefined offset" notice and the confusing message "File  does not exist."
if (!isset($argv[1])) {
    die("Usage: php importer.php <path-to-wikipedia-content-xml> [<number-of-articles>]\n");
}
if (!file_exists($argv[1])) {
    die("File $argv[1] does not exist.");
}

include_once "source/MavenProject.php";

// As before, an existing target folder aborts the run, but now with an accurate
// message and without the PHP warning mkdir() raises for an existing directory.
if (file_exists("target")) {
    die("Folder target already exists; remove it before running the import");
}
if (!mkdir("target")) {
    die("Could not create folder target");
}
define("TARGET", "target");

$project = new MavenProject("target");
$project->create();

include_once "source/WikiParser.php";
include_once "source/JcrDocumentWriter.php";

$parser = new WikiParser(new JcrDocumentWriter());
$parser->setFile($argv[1]);

$max = 1000;
if (isset($argv[2]) && 0 < (int) $argv[2]) {
    $max = (int) $argv[2];
}
$parser->setLimit($max);

$parser->parse();
/**
 * In-memory JCR node: a named item with a primary type, a set of properties
 * and child nodes (same-name siblings allowed), serialisable to JCR
 * system-view XML.
 */
class Node {
    const JCR_PRIMARY_TYPE = "jcr:primaryType";
    const JCR_MIXIN_TYPES = "jcr:mixinTypes";

    /** Parent node; null until this node is attached through addChild(). */
    private $parent;

    private $name;

    /** Child nodes: name => list of same-name siblings in insertion order. */
    /* package */ public $_nodes = array();
    /** Properties: name => property object. */
    /* package */ public $_properties = array();

    /**
     * @param string $name node name
     * @param string $type primary node type, stored as the jcr:primaryType Name property
     */
    public function __construct($name, $type) {
        $this->name = $name;
        $this->setProperty(self::JCR_PRIMARY_TYPE, value::newName($type));
    }

    public function getName() {
        return $this->name;
    }

    public function getPrimaryNodeType() {
        return $this->getProperty(self::JCR_PRIMARY_TYPE)->getValue()->getString();
    }

    /**
     * Position of this node among its same-name siblings (0-based).
     * A node without a parent has no siblings and reports 0.
     */
    public function getIndex() {
        if ($this->parent === null || !isset($this->parent->_nodes[$this->name])) {
            return 0;
        }
        $i = 0;
        foreach ($this->parent->_nodes[$this->name] as $sibling) {
            // Identity comparison: "==" compares objects property by property
            // and recurses through the parent/child references.
            if ($sibling === $this) {
                return $i;
            }
            $i++;
        }
        return $i;
    }

    /**
     * @return property|null the property with that name, or null when it is not set
     */
    public function getProperty($name) {
        return isset($this->_properties[$name]) ? $this->_properties[$name] : null;
    }

    /**
     * Sets (or replaces) a property. An array creates a multi-valued property.
     */
    public function setProperty($name, $value) {
        $this->_properties[$name] = new property($this, $name, $value);
    }

    /**
     * Appends a mixin type to the multi-valued jcr:mixinTypes property,
     * creating the property on first use.
     */
    public function addMixin($name) {
        if (!isset($this->_properties[self::JCR_MIXIN_TYPES])) {
            $this->setProperty(self::JCR_MIXIN_TYPES, array());
        }
        $values = $this->getProperty(self::JCR_MIXIN_TYPES)->getValues();
        $values[] = value::newName($name);
        $this->setProperty(self::JCR_MIXIN_TYPES, $values);
    }

    /**
     * Creates a child node and attaches it.
     *
     * @return Node the new child
     */
    public function addNode($name, $type) {
        return $this->addChild(new Node($name, $type));
    }

    /**
     * Attaches an existing node as the last sibling with its name.
     *
     * @return Node the attached child
     */
    public function addChild(Node $child) {
        $name = $child->name;
        if (!isset($this->_nodes[$name])) {
            $this->_nodes[$name] = array();
        }
        $this->_nodes[$name][] = $child;
        $child->parent = $this;
        return $child;
    }

    /**
     * Iterates over the children: all of them when $name is null, otherwise
     * only the siblings with that name (empty when there are none).
     *
     * @return ArrayIterator
     */
    public function getNodes($name = null) {
        $arr = array();
        // Strict null check so that names such as "" or "0" are not mistaken
        // for "no name given".
        if ($name === null) {
            foreach ($this->_nodes as $siblings) {
                foreach ($siblings as $sibling) {
                    $arr[] = $sibling;
                }
            }
        } else if (isset($this->_nodes[$name])) {
            $arr = $this->_nodes[$name];
        }
        return new ArrayIterator($arr);
    }

    /**
     * Serialises this node and its subtree as JCR system-view XML.
     *
     * @param bool   $root   true to emit the XML declaration and namespace on this node
     * @param string $indent indentation prefix; ignored (reset to "") for the root
     * @return string
     */
    public function toSystemView($root = true, $indent = "") {
        // The name ends up inside an XML attribute, so it must be escaped.
        $name = htmlspecialchars($this->name, ENT_COMPAT, 'UTF-8');
        if ($root) {
            $indent = "";
            $result = "<?xml version=\"1.0\"?>\n";
            $result .= "<sv:node sv:name=\"{$name}\" xmlns:sv=\"http://www.jcp.org/jcr/sv/1.0\">\n";
        } else {
            $result = $indent . "<sv:node sv:name=\"{$name}\">\n";
        }
        foreach ($this->_properties as $property) {
            // property::toSystemView() takes the indent as its only argument.
            $result .= $property->toSystemView($indent . " ");
        }
        foreach ($this->getNodes() as $child) {
            $result .= $child->toSystemView(false, $indent . " ");
        }
        $result .= $indent . "</sv:node>\n";
        return $result;
    }
}
/**
 * A parser for JCR system-view XML exports.
 * Constructs a Node object that corresponds with the content.
 */
class JcrXmlParser {

    /**
     * Parses a system-view XML document into a Node tree.
     *
     * @param string $text system-view XML
     * @return Node the node built from the document element
     * @throws Exception when the text is empty, not well-formed, or a node
     *                   lacks a jcr:primaryType value
     */
    public function parse($text) {
        if ($text == null || trim($text) == "") {
            throw new Exception("No text provided for parsing");
        }
        $xmlDom = new DOMDocument();
        if (!$xmlDom->loadXML($text) || $xmlDom->documentElement === null) {
            throw new Exception("Text is not well-formed XML");
        }

        // documentElement rather than firstChild: a comment (such as a licence
        // header) or processing instruction may precede the root element.
        return $this->parseNode($xmlDom->documentElement);
    }

    /** Builds a Node (with properties and children) from an sv:node element. */
    private function parseNode($xmlNode) {
        $props = $this->getProperties($xmlNode);
        $name = $xmlNode->getAttribute("sv:name");

        $primary = isset($props["jcr:primaryType"])
            ? $this->getPropertyValues($props["jcr:primaryType"])
            : array();
        if (count($primary) == 0) {
            throw new Exception("Node '$name' has no jcr:primaryType");
        }
        $node = new Node($name, $primary[0]);

        foreach ($props as $propName => $xmlProp) {
            if ($propName == "jcr:primaryType")
                continue;
            $values = $this->getPropertyValues($xmlProp);
            if (count($values) == 0)
                continue;

            // Keep the declared sv:type; plain strings would all be stored as
            // type "String" and the type would be lost on re-serialisation.
            $type = $xmlProp->getAttribute("sv:type");
            if ($type == "") {
                $type = "String";
            }
            $typed = array();
            foreach ($values as $text) {
                $typed[] = new value($type, $text);
            }

            if (count($typed) > 1)
                $node->setProperty($propName, $typed);
            else
                $node->setProperty($propName, $typed[0]);
        }

        foreach ($this->getNodes($xmlNode) as $childXmlNode) {
            $node->addChild($this->parseNode($childXmlNode));
        }
        return $node;
    }

    /**
     * Child sv:node elements in document order. Returned as a plain list so
     * that same-name siblings are all kept.
     */
    private function getNodes($xmlNode) {
        return $this->getChildElements($xmlNode, "sv:node");
    }

    /** sv:property elements of a node, keyed by their sv:name. */
    private function getProperties($xmlNode) {
        $props = array();
        foreach ($this->getChildElements($xmlNode, "sv:property") as $item) {
            $props[$item->getAttribute("sv:name")] = $item;
        }
        return $props;
    }

    /** Text of every sv:value child; an empty sv:value yields "". */
    private function getPropertyValues($xmlProp) {
        $values = array();
        foreach ($this->getChildElements($xmlProp, "sv:value") as $item) {
            // textContent of the element itself: an empty value has no child node.
            $values[] = $item->textContent;
        }
        return $values;
    }

    /** Direct child elements of $xmlNode with the given qualified name. */
    private function getChildElements($xmlNode, $nodeName) {
        $found = array();
        for ($item = $xmlNode->firstChild; $item != null; $item = $item->nextSibling) {
            if ($item->nodeType == XML_ELEMENT_NODE && $item->nodeName == $nodeName) {
                $found[] = $item;
            }
        }
        return $found;
    }

}
/**
 * A named JCR property holding either a single value object or a list of
 * value objects (multi-valued). Raw values are wrapped as String values.
 */
class property {
    /** Owning node; its public $_properties array is keyed by property name. */
    private $parent;

    /** A value object, or an array of value objects for a multi-valued property. */
    private $data;
    private $name;

    /**
     * @param object $parent owning node
     * @param string $name   property name
     * @param mixed  $data   a single value, or an array for a multi-valued property
     */
    public function __construct($parent, $name, $data) {
        $this->parent = $parent;
        $this->name = $name;
        if (is_array($data)) {
            $this->setValues($data);
        } else {
            $this->setValue($data);
        }
    }

    /**
     * @return value
     * @throws Exception when the property is multi-valued
     */
    public function getValue() {
        if (is_array($this->data)) {
            throw new Exception("Multi-valued property");
        }
        return $this->data;
    }

    /**
     * @return array list of value objects
     * @throws Exception when the property is single-valued
     */
    public function getValues() {
        if (!is_array($this->data)) {
            throw new Exception("Single-valued property");
        }
        return $this->data;
    }

    public function setValue($value) {
        $this->data = $this->wrap($value);
    }

    public function setValues($values) {
        $this->data = array();
        foreach ($values as $value) {
            $this->data[] = $this->wrap($value);
        }
    }

    /**
     * Type name of the value(s); an empty multi-valued property reports "String".
     */
    public function getType() {
        if (!is_array($this->data)) {
            return $this->data->getType();
        }
        return count($this->data) > 0 ? $this->data[0]->getType() : "String";
    }

    /** Detaches this property from its owning node. */
    public function remove() {
        // Must go through $this->parent; a bare $parent is an undefined local
        // and the property would never actually be removed.
        unset($this->parent->_properties[$this->name]);
    }

    /**
     * Serialises the property as a system-view sv:property element.
     *
     * @param string $indent indentation prefix for the element
     * @return string
     */
    public function toSystemView($indent = "") {
        $result = $indent . "<sv:property sv:name=\"{$this->encode($this->name)}\" sv:type=\"{$this->getType()}\">\n";
        $values = is_array($this->data) ? $this->data : array($this->data);
        foreach ($values as $value) {
            $result .= $indent . " <sv:value>" . $this->encode($value->getString()) . "</sv:value>\n";
        }
        $result .= $indent . "</sv:property>\n";
        return $result;
    }

    /** Returns value objects unchanged and wraps anything else as a String value. */
    private function wrap($value) {
        return ($value instanceof value) ? $value : value::newString($value);
    }

    /** Escapes text for use in XML element content or a double-quoted attribute. */
    private function encode($value) {
        return htmlspecialchars($value, ENT_COMPAT, 'UTF-8');
    }
}
/**
 * Returns the text found between the first occurrence of $a and the first
 * occurrence of $b that follows it.
 *
 * @param string $str text to search
 * @param string $a   opening delimiter
 * @param string $b   closing delimiter
 * @return string|false the enclosed text, or false when either delimiter is missing
 */
function getPartBetween($str, $a, $b)
{
    $openAt = strpos($str, $a);
    if ($openAt === false) {
        return false;
    }

    // The enclosed part begins right after the opening delimiter.
    $from = $openAt + strlen($a);

    $closeAt = strpos($str, $b, $from);
    if ($closeAt === false) {
        return false;
    }

    return substr($str, $from, $closeAt - $from);
}
href="${1}">${2}</a>',$html); + $html = preg_replace_callback('/\[([^\[\]\|\n\' ]+)[\| ]([^\]\']+)\]/','helper_externlinks',$html); + + // allowed tags + $html = preg_replace('/<(\/?)(small|sup|sub|u)>/','<${1}${2}>',$html); + + $html = preg_replace('/\n*<br *\/?>\n*/',"\n",$html); + $html = preg_replace('/<(\/?)(math|pre|code|nowiki)>/','<${1}pre>',$html); + $html = preg_replace('/<!--/','<!--',$html); + $html = preg_replace('/-->/',' -->',$html); + + // headings + for($i=7;$i>0;$i--){ + $html = preg_replace( + '/\n+[=]{'.$i.'}([^=]+)[=]{'.$i.'}\n*/', + '<h'.$i.'>${1}</h'.$i.'>', + $html + ); + } + + //lists + $html = preg_replace( + '/(\n[ ]*[^#* ][^\n]*)\n(([ ]*[*]([^\n]*)\n)+)/', + '${1}<ul>'."\n".'${2}'.'</ul>'."\n", + $html + ); + $html = preg_replace( + '/(\n[ ]*[^#* ][^\n]*)\n(([ ]*[#]([^\n]*)\n)+)/', + '${1}<ol>'."\n".'${2}'.'</ol>'."\n", + $html + ); + $html = preg_replace('/\n[ ]*[\*#]+([^\n]*)/','<li>${1}</li>',$html); + + $html = preg_replace('/----/','<hr />',$html); + + //$html = nl2br($html); + // line breaks + $html = preg_replace('/[\n\r]{4}/',"<br/><br/>",$html); + $html = preg_replace('/[\n\r]{2}/',"<br/>",$html); + + $html = preg_replace('/[>]<br\/>[<]/',"><",$html); + + return $html; +} +function parseRaw($title,$page){ + //$text = (getPartBetween($page, '<text xml:space="preserve">', '</text>')); + $html = $page; + //echo "<!-- " . wordwrap($text,120,"\n",1) . 
/**
 * preg_replace_callback helper that turns an external wiki link into an anchor.
 *
 * @param array $matches $matches[1] is the link target; $matches[2], when
 *                       present, is the link text
 * @return string the <a> element
 */
function helper_externlinks($matches)
{
    // The target is placed inside a double-quoted attribute, so quotes and
    // ampersands in the URL must be escaped.
    $href = htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');

    // empty() would also discard a legitimate link text of "0".
    $hasText = isset($matches[2]) && $matches[2] !== '';

    // The link text is left as-is because it may already contain markup
    // produced by earlier formatting passes; the fallback is the escaped target.
    $text = $hasText ? $matches[2] : $href;

    return '<a href="' . $href . '">' . $text . '</a>';
}
/**
 * Converts the wiki markup of one table ("{|" ... "|}") to an HTML table.
 *
 * Recognised line types (after trimming):
 *   "{| attrs"         table start, attrs copied onto the <table> tag
 *   "|-"               row separator (any number of dashes)
 *   "|}"               table end; the rest of the input is ignored
 *   "| attrs| text"    data cell, attributes optional
 *   "! attrs| text"    header cell, attributes optional
 *   anything else      text appended to the output via simpleText()
 *
 * @param string $intext wiki markup of a single table
 * @return string HTML for the table
 */
function convertTable($intext)
{
    // All state is initialised up front; it used to be read before assignment.
    $table = '';
    $tableopen = false;
    $rowopen = false;
    $cell = ''; // tag name of the open cell ('td' or 'th'), '' when none is open

    foreach (explode("\n", $intext) as $line) {
        $line = trim($line);
        $first = substr($line, 0, 1);

        if (substr($line, 0, 2) == '{|') {
            // Table start. Only "{|" counts, so a "{{template}}" line inside
            // the table no longer resets the output. Everything after "{|"
            // is the attribute list.
            $attributes = trim(substr($line, 2));
            $table = '<table' . ($attributes == '' ? '' : ' ' . $attributes) . ">\n";
            $tableopen = true;
            continue;
        }

        if ($first != '|' && $first != '!') {
            // plain text
            $table .= simpleText($line) . "\n";
            continue;
        }

        $rest = substr($line, 1);

        if ($first == '|' && substr($rest, 0, 1) == '}') {
            // table end
            break;
        }

        if ($first == '|' && substr($rest, 0, 1) == '-') {
            // Row break: "|-" is the standard separator; longer runs of
            // dashes such as "|-----" are equivalent.
            if ($cell != '') {
                $table .= "</" . $cell . ">\n";
                $cell = '';
            }
            if ($rowopen) {
                $table .= "\t</tr>\n";
            }
            $table .= "\t<tr>\n";
            $rowopen = true;
            continue;
        }

        // Data or header cell. Close whichever cell is still open, whatever
        // its type, so <td> and <th> never nest.
        $tag = ($first == '!') ? 'th' : 'td';
        if ($cell != '') {
            $table .= "</" . $cell . ">\n";
        }
        if (!$rowopen) {
            // The first row may start without a leading "|-".
            $table .= "\t<tr>\n";
            $rowopen = true;
        }
        $stuff = explode('| ', $rest, 2);
        if (count($stuff) == 1) {
            $table .= "\t\t<" . $tag . ">" . simpleText($stuff[0]);
        } else {
            $table .= "\t\t<" . $tag . " " . $stuff[0] . ">" . simpleText($stuff[1]);
        }
        $cell = $tag;
    }

    // Close whatever is still open, innermost first.
    if ($cell != '') {
        $table .= "</" . $cell . ">\n";
    }
    if ($rowopen) {
        $table .= "\t</tr>\n";
    }
    if ($tableopen) {
        $table .= "</table>\n";
    }
    return $table;
}
<?php
/**
 * Runs a CategoryHandler over a Wikipedia dump and then calls Category::save().
 *
 * Usage: php listcategories.php <path-to-wikipedia-content-xml> [<number-of-articles>]
 *
 * The article limit defaults to 1000; any non-negative integer overrides it.
 * NOTE(review): 0 presumably means "no limit" (hash_importer.php passes 0) - confirm
 * against WikiParser::setLimit().
 */

// The dump path is mandatory; without this check a missing argument produced
// an "undefined offset" notice and the confusing message "File  does not exist."
if (!isset($argv[1])) {
    die("Usage: php listcategories.php <path-to-wikipedia-content-xml> [<number-of-articles>]\n");
}
if (!file_exists($argv[1])) {
    die("File $argv[1] does not exist.");
}

include_once "source/WikiParser.php";
include_once "source/CategoryHandler.php";

$counter = new CategoryHandler();
$parser = new WikiParser($counter);
$parser->setFile($argv[1]);

$max = 1000;
if (isset($argv[2]) && 0 <= (int) $argv[2]) {
    $max = (int) $argv[2];
}
$parser->setLimit($max);
$parser->setShowProgress(false);

$parser->parse();

Category::save();

/* EOF */
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <name>Wikipedia content</name> + <description>Wikipedia demo content</description> + <groupId>org.onehippo.addon.content.wikipedia</groupId> + <artifactId>wikipedia-content</artifactId> + <version>1.0.0</version> + <packaging>jar</packaging> + + <build> + <defaultGoal>install</defaultGoal> + <resources> + <resource> + <directory>${basedir}/src/main/resources</directory> + <targetPath>.</targetPath> + <includes> + <include>*.xml</include> + <include>*.cnd</include> + </includes> + </resource> + </resources> + <extensions> + <extension> + <groupId>org.jvnet.wagon-svn</groupId> + <artifactId>wagon-svn</artifactId> + <version>1.9</version> + </extension> + </extensions> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <source>1.5</source> + <target>1.5</target> + </configuration> + </plugin> + </plugins> + </build> + + <distributionManagement> + <repository> + <uniqueVersion>false</uniqueVersion> + <id>wikipediaimport-maven-repo</id> + <url>svn:http://forge.hippo-ecm.org/svn/wikipediaimport/maven2/</url> + </repository> + </distributionManagement> + +</project> Property changes on: attic/forge/wikipediaimport/trunk/resources/pom.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +Id \ No newline at end of property Added: attic/forge/wikipediaimport/trunk/resources/wikipedia-data.xml =================================================================== --- attic/forge/wikipediaimport/trunk/resources/wikipedia-data.xml (rev 0) +++ attic/forge/wikipediaimport/trunk/resources/wikipedia-data.xml 2017-03-02 
11:16:37 UTC (rev 58746) @@ -0,0 +1,59 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Copyright 2007-2009 Hippo + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" + BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<sv:node xmlns:sv="http://www.jcp.org/jcr/sv/1.0" sv:name="content"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hippostd:folder</sv:value> + </sv:property> + <sv:property sv:name="jcr:mixinTypes" sv:type="Name"> + <sv:value>hippo:harddocument</sv:value> + </sv:property> + <sv:property sv:name="hippo:paths" sv:type="String"> + </sv:property> + <sv:property sv:name="hippo:related___pathreference" sv:type="String"> + </sv:property> + <sv:property sv:name="hippostd:foldertype" sv:type="String"> + <sv:value>new-folder</sv:value> + </sv:property> + <sv:node sv:name="documents"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hippostd:folder</sv:value> + </sv:property> + <sv:property sv:name="jcr:mixinTypes" sv:type="Name"> + <sv:value>hippo:harddocument</sv:value> + </sv:property> + <sv:property sv:name="hippo:paths" sv:type="String"> + </sv:property> + <sv:property sv:name="hippo:related___pathreference" sv:type="String"> + </sv:property> + <sv:property sv:name="hippostd:foldertype" sv:type="String"> + <sv:value>new-folder</sv:value> + </sv:property> + <sv:node sv:name="wikipedia"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hippostd:folder</sv:value> + </sv:property> + <sv:property sv:name="jcr:mixinTypes" sv:type="Name"> + 
<sv:value>hippo:harddocument</sv:value> + </sv:property> + <sv:property sv:name="hippostd:foldertype" sv:type="String"> + <sv:value>new-document</sv:value> + <sv:value>new-folder</sv:value> + </sv:property> + </sv:node> + </sv:node> +</sv:node> + Property changes on: attic/forge/wikipediaimport/trunk/resources/wikipedia-data.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +Id \ No newline at end of property Added: attic/forge/wikipediaimport/trunk/resources/wikipedia-namespace.xml =================================================================== --- attic/forge/wikipediaimport/trunk/resources/wikipedia-namespace.xml (rev 0) +++ attic/forge/wikipediaimport/trunk/resources/wikipedia-namespace.xml 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,221 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Copyright 2007 Hippo + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" + BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<sv:node sv:name="wikipedia" + xmlns:sv="http://www.jcp.org/jcr/sv/1.0"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hipposysedit:namespace</sv:value> + </sv:property> + <sv:property sv:name="jcr:mixinTypes" sv:type="Name"> + <sv:value>mix:referenceable</sv:value> + </sv:property> + <sv:property sv:name="jcr:uuid" sv:type="String"> + <sv:value>0570e669-aadb-42e4-8d66-acdd31d39371</sv:value> + </sv:property> + <sv:node sv:name="article"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hipposysedit:templatetype</sv:value> + </sv:property> + <sv:property sv:name="jcr:mixinTypes" sv:type="Name"> + <sv:value>mix:referenceable</sv:value> + <sv:value>editor:editable</sv:value> + </sv:property> + <sv:property sv:name="jcr:uuid" sv:type="String"> + <sv:value>788ed814-f9ee-4b58-968c-d0b59763d30c</sv:value> + </sv:property> + <sv:node sv:name="hipposysedit:nodetype"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hippo:handle</sv:value> + </sv:property> + <sv:property sv:name="jcr:mixinTypes" sv:type="Name"> + <sv:value>hippo:hardhandle</sv:value> + </sv:property> + <sv:property sv:name="jcr:uuid" sv:type="String"> + <sv:value>f47b9324-8174-4c89-b9c2-c32ec95f7429</sv:value> + </sv:property> + <sv:node sv:name="hipposysedit:nodetype"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hipposysedit:nodetype</sv:value> + </sv:property> + <sv:property sv:name="jcr:mixinTypes" sv:type="Name"> + <sv:value>hipposysedit:remodel</sv:value> + <sv:value>hippo:harddocument</sv:value> + </sv:property> + <sv:property sv:name="jcr:uuid" sv:type="String"> + <sv:value>ac873d39-0817-4dac-bc8c-87603e5ec91c</sv:value> + </sv:property> + <sv:property sv:name="hipposysedit:node" sv:type="Boolean"> + <sv:value>true</sv:value> + </sv:property> + <sv:property sv:name="hipposysedit:supertype" sv:type="String"> + <sv:value>hippo:document</sv:value> + <sv:value>hippostd:publishable</sv:value> + 
<sv:value>hippostd:publishableSummary</sv:value> + </sv:property> + <sv:property sv:name="hipposysedit:uri" sv:type="String"> + <sv:value>http://forge.onehippo.org/wikipedia/nt/1.0</sv:value> + </sv:property> + <sv:node sv:name="title"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hipposysedit:field</sv:value> + </sv:property> + <sv:property sv:name="hipposysedit:path" sv:type="String"> + <sv:value>wikipedia:title</sv:value> + </sv:property> + <sv:property sv:name="hipposysedit:type" sv:type="String"> + <sv:value>String</sv:value> + </sv:property> + </sv:node> + <sv:node sv:name="body"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hipposysedit:field</sv:value> + </sv:property> + <sv:property sv:name="hipposysedit:path" sv:type="String"> + <sv:value>wikipedia:html</sv:value> + </sv:property> + <sv:property sv:name="hipposysedit:type" sv:type="String"> + <sv:value>Html</sv:value> + </sv:property> + </sv:node> + </sv:node> + </sv:node> + <sv:node sv:name="hipposysedit:prototypes"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>hipposysedit:prototypeset</sv:value> + </sv:property> + <sv:node sv:name="hipposysedit:prototype"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>wikipedia:article</sv:value> + </sv:property> + <sv:property sv:name="jcr:mixinTypes" sv:type="Name"> + <sv:value>hippo:harddocument</sv:value> + </sv:property> + <sv:property sv:name="hippostd:state" sv:type="String"> + <sv:value>unpublished</sv:value> + </sv:property> + <sv:property sv:name="hippostd:stateSummary" sv:type="String"> + <sv:value>new</sv:value> + </sv:property> + <sv:property sv:name="wikipedia:html" sv:type="String"> + <sv:value/> + </sv:property> + <sv:property sv:name="wikipedia:title" sv:type="String"> + <sv:value>Title</sv:value> + </sv:property> + </sv:node> + </sv:node> + <sv:node sv:name="editor:templates"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + 
<sv:value>editor:templateset</sv:value> + </sv:property> + <sv:node sv:name="_default_"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>frontend:plugincluster</sv:value> + </sv:property> + <sv:property sv:name="frontend:references" sv:type="String"> + <sv:value>engine</sv:value> + <sv:value>wicket.model</sv:value> + </sv:property> + <sv:property sv:name="frontend:services" sv:type="String"> + <sv:value>wicket.id</sv:value> + <sv:value>wicket.dialog</sv:value> + <sv:value>wicket.model</sv:value> + </sv:property> + <sv:property sv:name="frontend:properties" sv:type="String"> + <sv:value>mode</sv:value> + </sv:property> + <sv:property sv:name="mode" sv:type="String"> + <sv:value>edit</sv:value> + </sv:property> + <sv:node sv:name="root"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>frontend:plugin</sv:value> + </sv:property> + <sv:property sv:name="item" sv:type="String"> + <sv:value>${cluster.id}.field</sv:value> + </sv:property> + <sv:property sv:name="plugin.class" sv:type="String"> + <sv:value>org.hippoecm.frontend.service.render.ListViewPlugin</sv:value> + </sv:property> + <sv:property sv:name="wicket.id" sv:type="String"> + <sv:value>${wicket.id}</sv:value> + </sv:property> + </sv:node> + <sv:node sv:name="title"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>frontend:plugin</sv:value> + </sv:property> + <sv:property sv:name="caption" sv:type="String"> + <sv:value>Title</sv:value> + </sv:property> + <sv:property sv:name="engine" sv:type="String"> + <sv:value>${engine}</sv:value> + </sv:property> + <sv:property sv:name="field" sv:type="String"> + <sv:value>title</sv:value> + </sv:property> + <sv:property sv:name="mode" sv:type="String"> + <sv:value>${mode}</sv:value> + </sv:property> + <sv:property sv:name="plugin.class" sv:type="String"> + <sv:value>org.hippoecm.frontend.editor.plugins.field.PropertyFieldPlugin</sv:value> + </sv:property> + <sv:property sv:name="template.size" sv:type="String"> 
+ <sv:value/> + </sv:property> + <sv:property sv:name="wicket.id" sv:type="String"> + <sv:value>${cluster.id}.field</sv:value> + </sv:property> + <sv:property sv:name="wicket.model" sv:type="String"> + <sv:value>${wicket.model}</sv:value> + </sv:property> + </sv:node> + <sv:node sv:name="body"> + <sv:property sv:name="jcr:primaryType" sv:type="Name"> + <sv:value>frontend:plugin</sv:value> + </sv:property> + <sv:property sv:name="caption" sv:type="String"> + <sv:value>Body</sv:value> + </sv:property> + <sv:property sv:name="engine" sv:type="String"> + <sv:value>${engine}</sv:value> + </sv:property> + <sv:property sv:name="field" sv:type="String"> + <sv:value>body</sv:value> + </sv:property> + <sv:property sv:name="mode" sv:type="String"> + <sv:value>${mode}</sv:value> + </sv:property> + <sv:property sv:name="plugin.class" sv:type="String"> + <sv:value>org.hippoecm.frontend.editor.plugins.field.PropertyFieldPlugin</sv:value> + </sv:property> + <sv:property sv:name="template.height" sv:type="String"> + <sv:value/> + </sv:property> + <sv:property sv:name="template.width" sv:type="String"> + <sv:value/> + </sv:property> + <sv:property sv:name="wicket.id" sv:type="String"> + <sv:value>${cluster.id}.field</sv:value> + </sv:property> + <sv:property sv:name="wicket.model" sv:type="String"> + <sv:value>${wicket.model}</sv:value> + </sv:property> + </sv:node> + </sv:node> + </sv:node> + </sv:node> +</sv:node> Property changes on: attic/forge/wikipediaimport/trunk/resources/wikipedia-namespace.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +Id \ No newline at end of property Added: attic/forge/wikipediaimport/trunk/resources/wikipedia.cnd =================================================================== --- attic/forge/wikipediaimport/trunk/resources/wikipedia.cnd (rev 0) +++ attic/forge/wikipediaimport/trunk/resources/wikipedia.cnd 2017-03-02 
11:16:37 UTC (rev 58746) @@ -0,0 +1,23 @@ +/* + * Copyright 2008 Hippo + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<hippo='http://www.onehippo.org/jcr/hippo/nt/2.0'> +<hippostd='http://www.onehippo.org/jcr/hippostd/nt/2.0'> +<wikipedia='http://forge.onehippo.org/wikipedia/nt/1.0'> + +[wikipedia:article] > hippo:document, hippostd:publishable, hippostd:publishableSummary, hippostd:taggable +- wikipedia:title (string) +- wikipedia:html (string) Added: attic/forge/wikipediaimport/trunk/run_tests.php =================================================================== --- attic/forge/wikipediaimport/trunk/run_tests.php (rev 0) +++ attic/forge/wikipediaimport/trunk/run_tests.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,35 @@ +<? + +include_once "source/TestCase.php"; + +$tests = null; +if (count($argv) > 1) { + $tests = $argv; + array_shift($tests); +} + +echo "Starting tests\n"; +$suite = opendir("test"); +while ($entry = readdir($suite)) { + if (substr($entry, 0, 1) == ".") + continue; + + $clazzName = substr($entry, 0, strrpos($entry, ".")); + if ($tests != null && !in_array($clazzName, $tests)) + continue; + + include "test/" . 
$entry; + if (!class_exists($clazzName)) { + continue; + } + + $clazz = new ReflectionClass($clazzName); + if ($clazz->isSubclassOf("TestCase")) { + echo "Running {$clazz->name}\n"; + $test = $clazz->newInstance(); + $test->run(); + } +} +closedir($suite); +echo "Done\n"; + Added: attic/forge/wikipediaimport/trunk/source/Category.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/Category.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/Category.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,116 @@ +<?php + +define("STATUS_HOLLOW", 0); +define("STATUS_LOADED", 1); + +class Category { + private static $categories = array(); + + private $parents; + private $name; + private $count = 0; + + private function __construct(&$name, &$parents = null) { + $this->name = &$name; + $this->parents = &$parents; + } + + public function getName() { + return $this->name; + } + + public function getParents() { + if ($this->parents == null) { + return array(); + } + return $this->parents; + } + + public function getStatus() { + if ($this->parents == null) { + return STATUS_HOLLOW; + } else { + return STATUS_LOADED; + } + } + + public function setParents(&$parents) { + $this->parents = $parents; + } + + public function getCount() { + return $this->count; + } + + public function addDocument($document) { + $this->count++; + } + + public static function find() { + return new ArrayIterator(self::$categories); + } + + public static function load($name) { + if (!isset(self::$categories[$name])) { + self::$categories[$name] = new Category($name); + } + return self::$categories[$name]; + } + + public static function save($file = "categories.json") { + $fd = fopen($file, "w"); + fwrite($fd, "[\n"); + $first = true; + foreach (self::$categories as $category) { + if ($first) { + $first = false; + } else { + fwrite($fd, ",\n"); + } + if ($category->getParents() != null) { + $parents = array(); + foreach 
($category->getParents() as $parent) { + $parents[] = $parent->name; + } + fwrite($fd, json_encode(array("name" => $category->name, "count" => $category->count, "parents" => $parents))); + } else { + fwrite($fd, json_encode(array("name" => $category->name, "count" => $category->count))); + } + } + fwrite($fd, "\n]\n"); + fclose($fd); + } + + public static function restore($file = "categories.json") { + $fd = fopen($file, "r"); + $lines = 0; + $last = false; + while (!feof($fd) && !$last) { + $linebuffer = trim(fgets($fd)); + if ($lines == 0) { + $lines++; + continue; + } + if (substr($linebuffer, strlen($linebuffer) - 1) != ",") { + $last = true; + $json = $linebuffer; + } else { + $json = substr($linebuffer, 0, strlen($linebuffer) - 1); + } + $obj = json_decode($json); + $category = self::load($obj->name); + $category->count = $obj->count; + if (isset($obj->parents)) { + $parents = $obj->parents; + $parentCats = array(); + foreach ($parents as $parent) { + $parentCats[] = self::load($parent); + } + $category->parents = $parentCats; + } + } + fclose($fd); + } + +} + Added: attic/forge/wikipediaimport/trunk/source/CategoryCycleDetector.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/CategoryCycleDetector.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/CategoryCycleDetector.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,55 @@ +<? + +class CategoryCycleDetector { + private $visited = array(); + private $stack = array(); + private $cycles = array(); + + public $debug; + + public function __construct($debug = false) { + $this->debug = $debug; + } + + public function getCycles($categories) { + foreach ($categories as $category) { + $this->visit($category); + } + return $this->cycles; + } + + private function visit($category) { + $name = $category->getName(); + + if ($this->debug) + echo json_encode($this->stack) . 
",\n"; + if (in_array($name, $this->stack)) { + $cycle = array(); + $in_cycle = false; + foreach ($this->stack as $cat) { + if ($in_cycle) { + $cycle[] = $cat; + } else if ($cat == $name) { + $in_cycle = true; + $cycle[] = $cat; + } + } + $this->cycles[] = $cycle; + return; + } + + if (isset($this->visited[$name])) { + return; + } + + $this->visited[$name] = true; + array_push($this->stack, $name); + foreach ($category->getParents() as $parent) { + if ($this->debug) + echo $name . ": ". $parent->getName() . "\n"; + $this->visit($parent); + } + array_pop($this->stack); + } + +} Added: attic/forge/wikipediaimport/trunk/source/CategoryFilter.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/CategoryFilter.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/CategoryFilter.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,30 @@ +<? + +include_once dirname(__FILE__) . "/IWikiHandler.php"; + +class CategoryFilter implements IWikiHandler { + private $upstream; + private $categories; + + public function __construct($upstream, $categories) { + $this->upstream = $upstream; + $this->categories = array(); + foreach ($categories as $category) { + $this->categories[$category] = true; + } + } + + public function handle(WikiDocument $document) { + if ($document->getCategories() == null) + return false; + foreach ($document->getCategories() as $category) { + if (isset($this->categories[$category])) { + return $this->upstream->handle($document); + } + } + } + + public function close() { + $this->upstream->close(); + } +} Added: attic/forge/wikipediaimport/trunk/source/CategoryHandler.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/CategoryHandler.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/CategoryHandler.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,40 @@ +<?php + +include_once dirname(__FILE__) . 
"/Category.php"; + +class CategoryHandler implements IWikiHandler { + + public function handle(WikiDocument $document) { + foreach ($document->getCategories() as $category) { + Category::load($category)->addDocument($document); + } + + if ($this->matchCategory($document, &$categoryName)) { + $this->addCategory($categoryName, $document->getCategories()); + } + return true; + } + + public function close() { + } + + private function matchCategory($document, &$categoryName) { + $name = $document->getName(); + if (preg_match("/^Category:/", $name, &$matches)) { + $categoryName = substr($name, 9, strlen($name) - 9); + return true; + } + return false; + } + + private function addCategory($name, $parents) { + $parentCats = array(); + foreach($parents as $parent) { + $parentCats[] = Category::load($parent); + } + $category = Category::load($name); + $category->setParents($parentCats); + } + +} + Added: attic/forge/wikipediaimport/trunk/source/HippoExtensionWriter.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/HippoExtensionWriter.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/HippoExtensionWriter.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,45 @@ +<?php + +include_once "library/jcr/node.php"; + +class HippoExtensionWriter { + + private $folder; + + public function __construct() { + if (!defined("TARGET")) { + die("No TARGET defined"); + } + + $this->folder = new node("hippo:initialize", "hippo:initializefolder"); + + $nsItem = $this->folder->addNode("wikipedia", "hippo:initializeitem"); + $nsItem->setProperty("hippo:sequence", value::newDouble(5000)); + $nsItem->setProperty("hippo:namespace", "http://forge.onehippo.org/wikipedia/nt/1.0"); + $nsItem->setProperty("hippo:nodetypesresource", "wikipedia.cnd"); + $nsItem->setProperty("hippo:contentresource", "wikipedia-namespace.xml"); + $nsItem->setProperty("hippo:contentroot", "/hippo:namespaces"); + + $root = 
$this->folder->addNode("wikipedia-root", "hippo:initializeitem"); + $root->setProperty("hippo:sequence", value::newDouble(20000)); + $root->setProperty("hippo:contentresource", "wikipedia-data.xml"); + $root->setProperty("hippo:contentroot", "/"); + $this->save(); + } + + public function addContent($name, $resource, $sequenceId, $root = "") { + $item = $this->folder->addNode($name, "hippo:initializeitem"); + $item->setProperty("hippo:sequence", value::newDouble(20001 + $sequenceId)); + $item->setProperty("hippo:contentresource", $resource); + $item->setProperty("hippo:contentroot", "/content/documents/wikipedia" . $root); + $this->save(); + } + + private function save() { + $file = fopen(TARGET . "/src/main/resources/hippoecm-extension.xml", "w"); + fwrite($file, $this->folder->toSystemView()); + fclose($file); + } + +} + Added: attic/forge/wikipediaimport/trunk/source/IWikiHandler.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/IWikiHandler.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/IWikiHandler.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,10 @@ +<?php + +include_once dirname(__FILE__) . 
"/WikiDocument.php"; + +interface IWikiHandler { + + function handle(WikiDocument $document); + + function close(); +} Added: attic/forge/wikipediaimport/trunk/source/JcrDocument.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/JcrDocument.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/JcrDocument.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,28 @@ +<?php + +include_once "library/jcr/node.php"; + +class JcrDocument { + + private $handle; + + public function __construct(WikiDocument $document) { + $this->handle = new node($document->getTitle(), "hippo:handle"); + $this->handle->addMixin("hippo:hardhandle"); + + $article = $this->handle->addNode($document->getTitle(), "wikipedia:article"); + $article->addMixin("hippo:harddocument"); + $article->setProperty("wikipedia:title", $document->getTitle()); + $article->setProperty("wikipedia:html", $document->getHTML()); + $article->setProperty("hippostd:state", "unpublished"); + $article->setProperty("hippostd:stateSummary", "changed"); + $article->setProperty("hippostd:tags", $document->getCategories()); + } + + public function getNode() { + return $this->handle; + } + +} + + Added: attic/forge/wikipediaimport/trunk/source/JcrDocumentWriter.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/JcrDocumentWriter.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/JcrDocumentWriter.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,55 @@ +<?php + +include_once dirname(__FILE__) . "/HippoExtensionWriter.php"; +include_once dirname(__FILE__) . "/JcrDocument.php"; +include_once dirname(__FILE__) . 
"/JcrWikiFolder.php"; + +class JcrDocumentWriter implements IWikiHandler { + private $folder; + private $files = 1; + private $pages = 0; + private $extensionWriter; + + public function __construct() { + if (!defined("TARGET")) { + die("No TARGET defined"); + } + $this->extensionWriter = new HippoExtensionWriter(); + $this->newFolder(); + } + + public function handle(WikiDocument $document) { + $jcrDocument = new JcrDocument($document); + $article = $this->folder->addDocument($jcrDocument); + + $pages++; + if ($pages == 500){ + $this->nextFolder(); + $pages = 0; + } + return true; + } + + public function close() { + $this->closeFolder(); + } + + private function newFolder() { + $name = "wikipedia-content-{$this->files}"; + $this->folder = new JcrWikiFolder("wikipedia-" . $this->files, TARGET . "/src/main/resources/" . $name . ".xml"); + $this->extensionWriter->addContent($name, $name . ".xml", $this->files); + } + + private function closeFolder(){ + $this->folder->close(); + $this->folder = null; + } + + private function nextFolder(){ + $this->closeFile(); + $this->files++; + $this-->newFile(); + } + +} + Added: attic/forge/wikipediaimport/trunk/source/JcrHashFolder.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/JcrHashFolder.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/JcrHashFolder.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,50 @@ +<?php + +include_once "library/jcr/node.php"; + +/** + * A hash folder contains containers for content folders. 
+ */ +class JcrHashFolder { + + private $fileName; + private $folder; + private $children = array(); + + public function __construct($name, $fileName = null) { + $this->fileName = $fileName; + + $this->folder = new node($name, "hippostd:folder"); + $this->folder->addMixin("hippo:harddocument"); + $this->folder->setProperty("hippostd:foldertype", array("new-folder")); + } + + public function getFolder($hash) { + return $this->children[$hash]; + } + + public function addFolder(JcrWikiFolder $folder) { + $this->folder->addChild($folder->getNode()); + $this->children[$folder->getNode()->getName()] = $folder; + } + + public function save() { + if ($this->fileName == null) { + die ("No file name specified for JcrHashFolder"); + } + $file = fopen($this->fileName, "w"); + fwrite($file, $this->folder->toSystemView()); + fclose($file); + } + + public function close() { + if ($this->fileName != null) { + $this->save(); + } + foreach ($this->children as $name => $folder) { + $folder->close(); + } + } + +} + Added: attic/forge/wikipediaimport/trunk/source/JcrHashFolderWriter.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/JcrHashFolderWriter.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/JcrHashFolderWriter.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,57 @@ +<?php + +include_once dirname(__FILE__) . "/IWikiHandler.php"; +include_once dirname(__FILE__) . "/HippoExtensionWriter.php"; +include_once dirname(__FILE__) . "/JcrDocument.php"; +include_once dirname(__FILE__) . "/JcrWikiFolder.php"; +include_once dirname(__FILE__) . 
"/JcrHashFolder.php"; + +class JcrHashFolderWriter implements IWikiHandler { + private $folders; + private $files = 1; + private $extensionWriter; + + public function __construct() { + if (!defined("TARGET")) { + die("No TARGET defined"); + } + + $this->extensionWriter = new HippoExtensionWriter(); + $this->folders = array(); + } + + public function handle(WikiDocument $document) { + $md5 = md5($document->getTitle()); + + $firstLevel = substr($md5, 0, 2); + if (!isset($this->folders[$firstLevel])) { + $name = "wikipedia-content-{$firstLevel}"; + $this->folders[$firstLevel] = new JcrHashFolder($firstLevel, + TARGET . "/src/main/resources/" . $name . ".xml"); + $this->extensionWriter->addContent($name, $name . ".xml", ++$this->files); + } + $container = $this->folders[$firstLevel]; + + $secondLevel = substr($md5, 2, 2); + $folder = $container->getFolder($secondLevel); + if ($folder == null) { + $name = "wikipedia-content-{$firstLevel}_{$secondLevel}"; + $folder = new JcrWikiFolder($secondLevel); + $container->addFolder($folder); + } + + $jcrDoc = new JcrDocument($document); + $folder->addDocument($jcrDoc); + $container->save(); + + return true; + } + + public function close() { + foreach ($this->folders as $name => $container) { + $container->close(); + } + } + +} + Added: attic/forge/wikipediaimport/trunk/source/JcrWikiFolder.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/JcrWikiFolder.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/JcrWikiFolder.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,45 @@ +<?php + +include_once "library/jcr/node.php"; + +class JcrWikiFolder { + + private $fileName; + private $folder; + + public function __construct($name, $fileName = null) { + $this->fileName = $fileName; + + $this->folder = new node($name, "hippostd:folder"); + $this->folder->addMixin("hippo:harddocument"); + $this->folder->setProperty("hippostd:foldertype", array("new-document", 
"new-folder")); + } + + public function getNode() { + return $this->folder; + } + + public function addDocument(JcrDocument $document) { + $handle = $this->folder->addChild($document->getNode()); + if ($this->fileName != null) { + $this->save(); + } + return $handle; + } + + public function close() { + if ($this->fileName != null) { + $this->save(); + } + } + + public function save() { + if ($this->fileName == null) { + die("No file name specified for JcrWikiFolder"); + } + $file = fopen($this->fileName, "w"); + fwrite($file, $this->folder->toSystemView()); + fclose($file); + } +} + Added: attic/forge/wikipediaimport/trunk/source/MavenProject.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/MavenProject.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/MavenProject.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,24 @@ +<?php + +class MavenProject { + + private $target; + + public function __construct($target = "target") { + if (!file_exists($target)) { + die("path $target does not exist"); + } + $this->target = $target; + } + + public function create() { + if (!file_exists($this->target . "/src/main/resources")) { + mkdir($this->target . "/src/main/resources", 0777, true); + } + copy(dirname(__FILE__) . "/../resources/pom.xml", $this->target . "/pom.xml"); + foreach (array("wikipedia.cnd", "wikipedia-data.xml", "wikipedia-namespace.xml") as $file) { + copy(dirname(__FILE__) . "/../resources/" . $file, $this->target . "/src/main/resources/" . $file); + } + } + +} Added: attic/forge/wikipediaimport/trunk/source/OrderedList.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/OrderedList.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/OrderedList.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,64 @@ +<? 
+ +class ListEntry { + var $entry; + var $next; + + function __construct($entry, $next) { + $this->entry = $entry; + $this->next = $next; + } +} + +class OrderedList { + private $start = null; + private $count = 0; + private $max; + + function __construct($max = 1000) { + $this->max = $max; + } + + function compare($a, $b) { + if ($a->getCount() > $b->getCount()) { + return 1; + } else if ($a->getCount() < $b->getCount()) { + return -1; + } else { + return 0; + } + } + + function add($entry) { + if ($this->start == null) { + $this->start = new ListEntry($entry, null); + $this->count = 1; + } else { + // if entry has lower count than first entry, do nothing + if ($this->compare($entry, $this->start->entry) < 0) { + if ($this->count < $this->max) { + $this->start = new ListEntry($entry, $this->start); + $this->count++; + } + return; + } + // traverse while entry has higher count + $cursor = $this->start; + while ($cursor->next != null && $this->compare($entry, $cursor->next->entry) > 0) { + $cursor = $cursor->next; + } + $cursor->next = new ListEntry($entry, $cursor->next); + $this->count++; + if ($this->count > $this->max) { + $this->start = $this->start->next; + $this->count--; + } + } + } + + function getHead() { + return $this->start; + } +} + + Added: attic/forge/wikipediaimport/trunk/source/TestCase.php =================================================================== --- attic/forge/wikipediaimport/trunk/source/TestCase.php (rev 0) +++ attic/forge/wikipediaimport/trunk/source/TestCase.php 2017-03-02 11:16:37 UTC (rev 58746) @@ -0,0 +1,44 @@ +<? 
/**
 * Minimal xUnit-style base class: run() invokes every public method whose
 * name starts with "test", each wrapped in setUp() and tearDown().
 */
class TestCase {

    /** Names of the test methods that threw during run(). */
    private $errors = array();
    /** True between setUp() and tearDown(); lets run() detect overrides that skip the parent call. */
    private $running = false;

    protected function setUp() {
        $this->running = true;
    }

    protected function tearDown() {
        $this->running = false;
    }

    /**
     * @return array names of the test methods that failed so far
     */
    public final function getErrors() {
        return $this->errors;
    }

    /**
     * Runs all public test* methods of the concrete class.
     *
     * @throws Exception when an overriding setUp()/tearDown() did not call
     *                   its parent implementation
     */
    public function run() {
        $reflection = new ReflectionClass(get_class($this));
        foreach ($reflection->getMethods(ReflectionMethod::IS_PUBLIC) as $method) {
            if (substr($method->name, 0, 4) != "test") {
                continue;
            }
            $this->setUp();
            if (!$this->running) {
                throw new Exception("A class in the hierarchy of " . get_class($this) .
                        " did not call parent::setUp in their override");
            }
            try {
                $method->invoke($this);
            } catch (Exception $e) {
                $this->recordFailure($method->name, $e);
            } catch (Throwable $e) {
                // PHP 7+: Error and AssertionError do not extend Exception.
                // Without this clause a failing assert() aborted the whole run
                // and skipped tearDown(). On PHP 5 the clause never matches.
                $this->recordFailure($method->name, $e);
            }
            $this->tearDown();
            if ($this->running) {
                throw new Exception("A class in the hierarchy of " . get_class($this) .
                        " did not call parent::tearDown in their override");
            }
        }
    }

    /**
     * Prints the stack trace of a failed test and remembers the method name.
     */
    private function recordFailure($methodName, $error) {
        echo $error->getTraceAsString() . "\n";
        $this->errors[] = $methodName;
    }

}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/source/WikiDocument.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/source/WikiDocument.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/source/WikiDocument.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,44 @@
// +<?php

require_once 'library/wiki/parseRaw.inc.php';

/**
 * One wiki page: its name and its raw wiki markup.
 */
class WikiDocument {
    private $name;
    private $text;

    /**
     * @param string $name page name
     * @param string $text raw wiki markup of the page
     */
    public function __construct($name, $text) {
        $this->name = $name;
        $this->text = $text;
    }

    public function getName() {
        return $this->name;
    }

    /**
     * @return string the name with the characters / ' : " * removed
     */
    public function getTitle() {
        return str_replace(array('/', "'", ':', '"', '*'), '', $this->name);
    }

    public function getText() {
        return $this->text;
    }

    /**
     * Converts the markup with parseRaw()/simpleText() and drops everything
     * up to and including the first line that contains "}}" (through the last
     * "}}" on that line) - presumably a leading template block; TODO confirm.
     *
     * @return string trimmed remainder, or the whole trimmed markup when no
     *                "}}" occurs
     */
    public function getHTML() {
        $markup = simpleText(parseRaw('', $this->text));
        $matches = array();
        // Only strip a preface when the pattern actually matched; the former
        // code read $matches[0] unconditionally and relied on notices and
        // warnings to fall through for plain text.
        if (preg_match("/.*}}/", $markup, $matches, PREG_OFFSET_CAPTURE)) {
            $prefaceEnd = $matches[0][1] + strlen($matches[0][0]);
            $markup = substr($markup, $prefaceEnd);
        }
        return trim($markup);
    }

    /**
     * Extracts the names of all [[Category:...]] links in the raw text.
     * Links carrying a sort key ([[Category:Name|key]]) are not matched by
     * this pattern.
     *
     * @return array list of category names
     */
    public function getCategories() {
        $matches = array();
        // The capture group replaces the former substr() arithmetic
        // (11 leading and 2 trailing characters).
        preg_match_all("/\[\[Category:([^|\]]+)\]\]/", $this->text, $matches);
        return $matches[1];
    }

}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/source/WikiParser.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/source/WikiParser.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/source/WikiParser.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,83 @@
// +<?php

include_once dirname(__FILE__) . "/IWikiHandler.php";
include_once dirname(__FILE__) . "/WikiDocument.php";

/**
 * Line-based reader for a wiki XML dump: collects each <page>...</page>
 * fragment, turns it into a WikiDocument and passes it to the handler.
 */
class WikiParser {
    private $handler;

    private $fileName;
    /** Maximum number of handled articles; 0 means unlimited. */
    private $limit = 1000;
    private $showProgress = true;

    function __construct(IWikiHandler $handler) {
        $this->handler = $handler;
    }

    function setFile($str) {
        $this->fileName = $str;
    }

    function setLimit($limit) {
        $this->limit = $limit;
    }

    function setShowProgress($show) {
        $this->showProgress = $show;
    }

    /**
     * Reads the configured file and feeds every page to the handler until
     * the file ends or the limit is reached, then closes the handler. Only
     * pages for which the handler returns a true value count towards the limit.
     *
     * @throws Exception when the file cannot be opened
     */
    function parse() {
        $file = fopen($this->fileName, "r");
        if ($file === false) {
            // feof()/fgets() on a failed handle never report end-of-file.
            throw new Exception("Unable to open " . $this->fileName);
        }
        $articles = 0;

        $inPage = false;
        $pagebuffer = '';
        while (!feof($file) && ($this->limit == 0 || $articles < $this->limit)) {
            $linebuffer = fgets($file);
            if ($linebuffer === false) {
                // nothing more could be read
                break;
            }
            // a page starts on this line
            if (strpos($linebuffer, '<page>') !== false) {
                $inPage = true;
            }
            // inside a page every line is collected
            if ($inPage) {
                $pagebuffer .= $linebuffer;
            }
            // a page ends on this line
            if (strpos($linebuffer, '</page>') !== false) {
                $inPage = false;
            }
            // a complete page is buffered: convert and hand it over
            if (!$inPage && !empty($pagebuffer)) {
                $document = $this->createDocument($pagebuffer);
                if ($document !== null && $this->handler->handle($document)) {
                    $articles++;
                    if ($this->showProgress && ($articles % 100 == 0)) {
                        echo "$articles articles parsed ";
                        if ($this->limit > 0) {
                            echo "[" . (int) (($articles / $this->limit) * 100) . "%]";
                        }
                        echo "\n";
                    }
                }
                $pagebuffer = '';
            }
        }

        $this->handler->close();

        // clean
        fclose($file);
        if ($this->showProgress) {
            echo "Done parsing [100%]\n";
        }
    }

    /**
     * Builds a WikiDocument from the XML of a single page.
     *
     * @param string $pageXml the buffered <page> fragment
     * @return WikiDocument|null null when the fragment cannot be parsed or
     *                           lacks a <title> or <text> element
     */
    private function createDocument($pageXml) {
        $xmlDom = new DOMDocument();
        if (!$xmlDom->loadXML($pageXml)) {
            return null;
        }
        $title = $xmlDom->getElementsByTagName('title')->item(0);
        $text = $xmlDom->getElementsByTagName('text')->item(0);
        if ($title === null || $text === null) {
            return null;
        }
        return new WikiDocument($title->nodeValue, $text->nodeValue);
    }
}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/source/WriterTestCase.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/source/WriterTestCase.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/source/WriterTestCase.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,38 @@
// +<?

include_once dirname(__FILE__) . "/TestCase.php";

/**
 * Base class for tests that write files: provides the TARGET directory
 * before each test and removes it again afterwards.
 */
class WriterTestCase extends TestCase {

    protected function setUp() {
        parent::setUp();
        // setUp() runs once per test method; a constant can only be defined once.
        if (!defined("TARGET")) {
            define("TARGET", "/tmp/wikipedia-test");
        }
        if (!is_dir(TARGET . "/src/main/resources")) {
            mkdir(TARGET . "/src/main/resources", 0777, true);
        }
    }

    protected function tearDown() {
        if (defined("TARGET")) {
            $this->delete(TARGET);
        }
        parent::tearDown();
    }

    /**
     * Recursively removes a file or directory; a missing path is ignored.
     */
    private function delete($file) {
        if (is_dir($file)) {
            $handle = opendir($file);
            if ($handle === false) {
                return;
            }
            // Compare against false explicitly: an entry named "0" is falsy
            // and used to end the loop early.
            while (($entry = readdir($handle)) !== false) {
                if ($entry == "." || $entry == "..") {
                    continue;
                }
                $this->delete($file . "/" . $entry);
            }
            closedir($handle);
            rmdir($file);
        } else if (is_link($file) || file_exists($file)) {
            unlink($file);
        }
    }

}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/CategoryCycleDetectorTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/CategoryCycleDetectorTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/CategoryCycleDetectorTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,63 @@
// +<?
include_once "source/CategoryCycleDetector.php";

/**
 * Test double for a category: just a name and a list of parent categories.
 */
class TestCategory {
    private $name;
    /** Parent categories; starts as an empty list so getParents() always returns an array. */
    private $parents = array();

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }

    public function getParents() {
        return $this->parents;
    }

    public function setParents($parents) {
        $this->parents = $parents;
    }
}

/**
 * Tests for CategoryCycleDetector::getCycles().
 * NOTE(review): TestCase is not included by this file; presumably the test
 * runner loads it first - confirm.
 */
class CategoryCycleDetectorTest extends TestCase {

    /** a -> b -> a must be reported as the single cycle (a, b). */
    public function testSimpleLoop() {
        $a = new TestCategory("a");
        $b = new TestCategory("b");
        $a->setParents(array($b));
        $b->setParents(array($a));

        $detector = new CategoryCycleDetector();
        $cycles = $detector->getCycles(array($a, $b));
        assert(count($cycles) == 1);
        $cycle = $cycles[0];
        assert($cycle[0] == "a");
        assert($cycle[1] == "b");
    }

    /** a -> b -> c -> {a, b} must be reported as the cycles (a, b, c) and (b, c). */
    public function testMultiLoop() {
        $a = new TestCategory("a");
        $b = new TestCategory("b");
        $c = new TestCategory("c");
        $a->setParents(array($b));
        $b->setParents(array($c));
        $c->setParents(array($a, $b));

        $detector = new CategoryCycleDetector();
        $cycles = $detector->getCycles(array($a, $b, $c));
        assert(count($cycles) == 2);

        $cycle = $cycles[0];
        assert($cycle[0] == "a");
        assert($cycle[1] == "b");
        assert($cycle[2] == "c");

        $cycle = $cycles[1];
        assert($cycle[0] == "b");
        assert($cycle[1] == "c");
    }
}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/CategoryFilterTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/CategoryFilterTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/CategoryFilterTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,47 @@
// +<?
include_once "source/CategoryFilter.php";

/**
 * IWikiHandler test double that records whether handle() and close() ran.
 */
class TestHandler implements IWikiHandler {

    public $handled = false;
    public $closed = false;

    public function handle(WikiDocument $document) {
        $this->handled = true;
        return true;
    }

    public function close() {
        $this->closed = true;
    }
}

/**
 * Tests that CategoryFilter forwards only documents in a listed category.
 */
class CategoryFilterTest extends TestCase {

    public function testCategoryIsHandled() {
        $handler = new TestHandler();
        $filter = new CategoryFilter($handler, array("TestCategory"));
        $document = new WikiDocument("test-document", "[[Category:TestCategory]]");
        $result = $filter->handle($document);
        assert ($result);
        assert ($handler->handled);
    }

    public function testOtherIsFiltered() {
        $handler = new TestHandler();
        $filter = new CategoryFilter($handler, array("TestCategory"));
        $document = new WikiDocument("test-document", "test");
        $result = $filter->handle($document);
        assert (!$result);
        assert (!$handler->handled);
    }

    public function testClose() {
        $handler = new TestHandler();
        $filter = new CategoryFilter($handler, array("TestCategory"));
        $filter->close();
        assert($handler->closed);
    }

}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/CategoryTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/CategoryTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/CategoryTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,38 @@
// +<?

// include_once like every other test file: a plain include redeclares the
// class when Category.php has already been loaded.
include_once dirname(__FILE__) . "/../source/Category.php";

/**
 * Tests saving and restoring categories to and from a JSON file.
 *
 * NOTE(review): testRestore() expects the "xxx" category that testReload()
 * loads, so the tests depend on running in declaration order - presumably
 * Category keeps its registry in static state; confirm.
 */
class CategoryTest extends TestCase {

    protected function tearDown() {
        parent::tearDown();
        // The file is missing when a test failed before writing it.
        if (file_exists("category_test.json")) {
            unlink("category_test.json");
        }
    }

    public function testReload() {
        Category::load("xxx");
        Category::save("category_test.json");

        $contents = file_get_contents("category_test.json");
        $obj = json_decode($contents);
        assert(is_array($obj));
        assert($obj[0]->name == "xxx");
    }

    public function testRestore() {
        file_put_contents("category_test.json", "[\n{\"name\": \"yyy\", \"count\": 12}\n]");
        Category::restore("category_test.json");
        $category = Category::load("yyy");
        assert($category->getCount() == 12);

        Category::save("category_test.json");
        $cmp = <<<HEREDOC
[
{"name":"xxx","count":0},
{"name":"yyy","count":12}
]

HEREDOC;
        assert($cmp == file_get_contents("category_test.json"));
    }
}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/HippoExtensionWriterTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/HippoExtensionWriterTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/HippoExtensionWriterTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,19 @@
// +<?

include_once dirname(__FILE__) . "/../source/WriterTestCase.php";
include_once dirname(__FILE__) . "/../source/HippoExtensionWriter.php";
include_once dirname(__FILE__) . "/../library/jcr/parser.php";

/**
 * Tests that HippoExtensionWriter::addContent() ends up in hippoecm-extension.xml.
 */
class HippoExtensionWriterTest extends WriterTestCase {

    public function testSaveAfterAddContent() {
        $writer = new HippoExtensionWriter();
        $writer->addContent("test", "content.xml", 1);

        $parser = new JcrXmlParser();
        $file = TARGET . "/src/main/resources/hippoecm-extension.xml";
        $node = $parser->parse(file_get_contents($file));
        $children = $node->getNodes("test");
        assert (count($children) == 1);
    }
}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/JcrDocumentWriterTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/JcrDocumentWriterTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/JcrDocumentWriterTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,39 @@
// +<?php

include_once "source/WriterTestCase.php";

include_once "source/WikiParser.php";
include_once "source/JcrDocumentWriter.php";
include_once "library/jcr/parser.php";

/**
 * Tests the folder/handle/document structure written by JcrDocumentWriter.
 */
class JcrDocumentWriterTest extends WriterTestCase {

    public function testWriter() {
        $writer = new JcrDocumentWriter();
        $document = new WikiDocument("test-title", "test-content");
        $writer->handle($document);
        $writer->close();

        $test = file_get_contents(TARGET . "/src/main/resources/wikipedia-content-1.xml");
        $parser = new JcrXmlParser();
        $node = $parser->parse($test);
        assert($node->getName() == "wikipedia-1");
        assert($node->getPrimaryNodeType() == "hippostd:folder");
        assert("hippo:harddocument" == $node->getProperty("jcr:mixinTypes")->getValue()->getString());

        // the folder holds exactly one handle named after the document
        $children = $node->getNodes();
        assert (count($children) == 1);
        $child = $children[0];
        assert ($child->getName() == "test-title");
        assert ($child->getPrimaryNodeType() == "hippo:handle");

        // todo: verify title and content
        $variants = $child->getNodes();
        assert (count($variants) == 1);
        $document = $variants[0];
        assert ($document->getName() == "test-title");
        assert ($document->getProperty("wikipedia:title")->getValue()->getString() == "test-title");
        assert ($document->getProperty("wikipedia:html")->getValue()->getString() == "test-content");
    }

}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/JcrHashFolderWriterTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/JcrHashFolderWriterTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/JcrHashFolderWriterTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,64 @@
// +<?

include_once "library/jcr/parser.php";
include_once "source/WriterTestCase.php";
include_once "source/JcrHashFolderWriter.php";
include_once "source/WikiDocument.php";

/**
 * Tests that JcrHashFolderWriter files a document under folders derived from
 * the md5 hash of its name and registers the content file in the extension.
 */
class JcrHashFolderWriterTest extends WriterTestCase {

    const DOCUMENT_NAME = "test-document";

    public function testHashFolderIsCreated() {
        $parser = new JcrXmlParser();
        $this->writeAndFindSubFolders($parser);
        $this->assertContentIsRegistered($parser);
    }

    public function testSubFolderIsCreated() {
        $parser = new JcrXmlParser();
        $subFolders = $this->writeAndFindSubFolders($parser);
        $node = $subFolders[0];

        $children = $node->getNodes(self::DOCUMENT_NAME);
        assert (count($children) == 1);

        $this->assertContentIsRegistered($parser);
    }

    /**
     * @return string name of the content file (without extension) for DOCUMENT_NAME
     */
    private function contentName() {
        return "wikipedia-content-" . substr(md5(self::DOCUMENT_NAME), 0, 2);
    }

    /**
     * Writes the test document, checks the generated first-level hash folder
     * and returns its children named after hash characters 3-4.
     *
     * @return array the matching second-level folder nodes
     */
    private function writeAndFindSubFolders($parser) {
        $writer = new JcrHashFolderWriter();
        $writer->handle(new WikiDocument(self::DOCUMENT_NAME, "content"));
        $writer->close();

        $hash = md5(self::DOCUMENT_NAME);
        $file = TARGET . "/src/main/resources/" . $this->contentName() . ".xml";
        assert(file_exists($file));

        $node = $parser->parse(file_get_contents($file));
        assert($node->getName() == substr($hash, 0, 2));
        assert($node->getPrimaryNodeType() == "hippostd:folder");

        $children = $node->getNodes(substr($hash, 2, 2));
        assert (count($children) == 1);
        return $children;
    }

    /**
     * Checks that hippoecm-extension.xml has exactly one node for the content file.
     */
    private function assertContentIsRegistered($parser) {
        $file = TARGET . "/src/main/resources/hippoecm-extension.xml";
        $node = $parser->parse(file_get_contents($file));
        $children = $node->getNodes($this->contentName());
        assert (count($children) == 1);
    }

}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/JcrXmlParserTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/JcrXmlParserTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/JcrXmlParserTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,42 @@
// +<?

include_once "source/TestCase.php";
include_once "library/jcr/parser.php";

/**
 * Tests JcrXmlParser on a small system-view document with one child node.
 */
class JcrXmlParserTest extends TestCase {

    public function testParser() {
        $parser = new JcrXmlParser();
        $node = $parser->parse(<<<HEREDOC
<?xml version="1.0"?>
<sv:node sv:name="test-node" xmlns:sv="http://www.jcp.org/jcr/sv/1.0">
  <sv:property sv:name="jcr:primaryType">
    <sv:value>jcr:test</sv:value>
  </sv:property>
  <sv:node sv:name="child-node">
    <sv:property sv:name="jcr:primaryType">
      <sv:value>jcr:child</sv:value>
    </sv:property>
    <sv:property sv:name="test-property">
      <sv:value>test</sv:value>
    </sv:property>
    <sv:property sv:name="test-multi">
      <sv:value>aap</sv:value>
      <sv:value>noot</sv:value>
    </sv:property>
  </sv:node>
</sv:node>
HEREDOC
        );
        assert ($node->getProperty("jcr:primaryType")->getValue()->getString() == "jcr:test");
        assert (count($node->getNodes()) == 1);
        $children = $node->getNodes();
        $child = $children[0];
        assert ($child->getProperty("jcr:primaryType")->getValue()->getString() == "jcr:child");
        assert ($child->getProperty("test-property")->getValue()->getString() == "test");
        $values = $child->getProperty("test-multi")->getValues();
        assert ($values[0]->getString() == "aap");
        assert ($values[1]->getString() == "noot");
    }
}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/MavenProjectTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/MavenProjectTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/MavenProjectTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,15 @@
// +<?php

include_once dirname(__FILE__) . "/../source/WriterTestCase.php";
include_once dirname(__FILE__) . "/../source/MavenProject.php";

/**
 * Tests that MavenProject::create() copies its resources into TARGET.
 */
class MavenProjectTest extends WriterTestCase {

    public function testResourcesAreCopied() {
        $project = new MavenProject(TARGET);
        $project->create();

        assert(file_exists(TARGET . "/pom.xml"));
        assert(file_exists(TARGET . "/src/main/resources/wikipedia.cnd"));
    }
}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/OrderedListTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/OrderedListTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/OrderedListTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,58 @@
// +<?
include_once "source/TestCase.php";
include_once "source/OrderedList.php";

/**
 * Smallest possible list payload: an object that only carries a count.
 */
class entry {
    public $count;

    function __construct($count) {
        $this->count = $count;
    }

    function getCount() {
        return $this->count;
    }
}

/**
 * Tests the ordering, duplicate handling and size cap of OrderedList.
 */
class OrderedListTest extends TestCase {

    /** Entries added out of order come back in ascending count order. */
    function testSimpleOrdering() {
        $ordered = new OrderedList();
        foreach (array(2, 1) as $value) {
            $ordered->add(new entry($value));
        }

        $this->expectTwoCounts($ordered, 1, 2);
    }

    /** Two entries with the same count are both kept. */
    function testDoublures() {
        $ordered = new OrderedList();
        foreach (array(1, 1) as $value) {
            $ordered->add(new entry($value));
        }

        $this->expectTwoCounts($ordered, 1, 1);
    }

    /** A list capped at two entries drops the lowest of three. */
    function testMaximum() {
        $ordered = new OrderedList(2);
        foreach (array(1, 2, 3) as $value) {
            $ordered->add(new entry($value));
        }

        $this->expectTwoCounts($ordered, 2, 3);
    }

    /**
     * Asserts that the list consists of exactly two nodes with the given counts.
     */
    private function expectTwoCounts($ordered, $first, $second) {
        $node = $ordered->getHead();
        assert($node->entry->count == $first);
        $node = $node->next;
        assert($node->entry->count == $second);
        assert($node->next == null);
    }

}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/TestCaseTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/TestCaseTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/TestCaseTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,80 @@
// +<?
include_once "source/TestCase.php";

/**
 * Fixture: a well-behaved test case that records which hooks were called.
 */
class TestClass extends TestCase {
    public $hasRun = false;
    public $setup = false;
    public $teardown = false;

    protected function setUp() {
        parent::setUp();
        $this->setup = true;
    }

    protected function tearDown() {
        $this->teardown = true;
        parent::tearDown();
    }

    public function testHello() {
        $this->hasRun = true;
    }

}

/**
 * Fixture: a test case whose only test throws.
 */
class TestException extends TestCase {
    public $teardown = false;

    protected function tearDown() {
        $this->teardown = true;
        parent::tearDown();
    }

    public function testThrow() {
        throw new Exception("Something went horribly wrong!");
    }
}

/**
 * Fixture: a test case whose tearDown() override skips the parent call.
 */
class TestTeardown extends TestCase {
    protected function tearDown() {
    }

    public function testTeardown() {
    }
}

/**
 * Tests the TestCase base class itself using the fixtures above.
 */
class TestCaseTest extends TestCase {

    public function testSubclass() {
        $test = new TestClass();
        $test->run();
        assert ($test->setup);
        assert ($test->hasRun);
        assert ($test->teardown);
    }

    public function testTeardownRunsAfterException() {
        $test = new TestException();
        // run() echoes the stack trace of the failing test; keep it out of
        // the test output, and close the buffer even when run() itself throws.
        ob_start();
        try {
            $test->run();
        } catch (Exception $e) {
            ob_end_clean();
            throw $e;
        }
        ob_end_clean();
        assert ($test->teardown);
        assert (in_array("testThrow", $test->getErrors()));
    }

    public function testTeardownMustCallSuper() {
        $test = new TestTeardown();
        $caught = false;
        try {
            $test->run();
        } catch (Exception $e) {
            // this is OK
            $caught = true;
        }
        if (!$caught) {
            throw new Exception("Teardown did not have to call parent");
        }
    }

}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/WikiDocumentTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/WikiDocumentTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/WikiDocumentTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,20 @@
// +<?php

include_once dirname(__FILE__) . "/../source/TestCase.php";
include_once dirname(__FILE__) . "/../source/WikiDocument.php";

/**
 * Tests category extraction and title/HTML access of WikiDocument.
 */
class WikiDocumentTest extends TestCase {

    public function testMultiCategory() {
        $document = new WikiDocument("xyz", "[[Category:a:b]] [[Category:c]]");
        assert(count($document->getCategories()) == 2);
        $categories = $document->getCategories();
        assert($categories[0] == "a:b");
    }

    public function testBasic() {
        $document = new WikiDocument("test-title", "test-content");
        assert ("test-title" == $document->getTitle());
        // spelled as declared in WikiDocument (getHTML)
        assert ("test-content" == $document->getHTML());
    }
}

// Original commit-mail diff header, preserved verbatim:
// Added: attic/forge/wikipediaimport/trunk/test/WriterTestCaseTest.php
// ===================================================================
// --- attic/forge/wikipediaimport/trunk/test/WriterTestCaseTest.php (rev 0)
// +++ attic/forge/wikipediaimport/trunk/test/WriterTestCaseTest.php 2017-03-02 11:16:37 UTC (rev 58746)
// @@ -0,0 +1,22 @@
// +<?

include_once "source/WriterTestCase.php";

/**
 * Fixture: records whether the TARGET constant exists while a test runs.
 */
class WriterTestClass extends WriterTestCase {
    public $defined = false;

    public function testDefined() {
        if (defined("TARGET")) {
            $this->defined = true;
        }
    }
}

/**
 * Tests that WriterTestCase defines TARGET before a test method runs.
 */
class WriterTestCaseTest extends TestCase {

    public function testTargetIsDefined() {
        $test = new WriterTestClass();
        $test->run();
        assert ($test->defined);
    }
}

// Mailing-list footer of the commit mail, preserved verbatim:
// _______________________________________________
// Hippocms-svn mailing list
// Hippocms-svn@lists.onehippo.org
// https://lists.onehippo.org/mailman/listinfo/hippocms-svn