Author: kn
Date: Fri Feb  8 12:35:13 2008
New Revision: 7322

Log:
- Added tokenizer test infrastructure
- Added basic tokenizer
- Tokenizer works for empty files! ;)

Added:
    experimental/Document/src/document/rst/
    experimental/Document/src/document/rst/token.php   (with props)
    experimental/Document/src/document/rst/tokenizer.php   (with props)
    experimental/Document/tests/document_rst_tokenizer_tests.php   (with props)
    experimental/Document/tests/files/rst/tokenizer/empty.tokens
Modified:
    experimental/Document/design/class_diagram.png
    experimental/Document/src/document_autoload.php
    experimental/Document/tests/suite.php

Modified: experimental/Document/design/class_diagram.png
==============================================================================
Binary files - no diff available.

Added: experimental/Document/src/document/rst/token.php
==============================================================================
--- experimental/Document/src/document/rst/token.php (added)
+++ experimental/Document/src/document/rst/token.php [iso-8859-1] Fri Feb  8 
12:35:13 2008
@@ -1,0 +1,72 @@
+<?php
+/**
+ * File containing the ezcDocumentRstToken struct
+ *
+ * @package Document
+ * @version //autogen//
+ * @copyright Copyright (C) 2005-2008 eZ systems as. All rights reserved.
+ * @license http://ez.no/licenses/new_bsd New BSD License
+ */
+
+/**
+ * Struct for RST document tokens
+ * 
+ * @package Document
+ * @version //autogen//
+ * @copyright Copyright (C) 2005-2008 eZ systems as. All rights reserved.
+ * @license http://ez.no/licenses/new_bsd New BSD License
+ */
+class ezcDocumentRstToken extends ezcBaseStruct
+{
+    /**
+     * Type identifier for a whitespace token.
+     */
+    const WHITESPACE    = 1;
+
+    /**
+     * Type identifier for a newline token.
+     */
+    const NEWLINE       = 2;
+
+    /**
+     * Type of this token, one of the constants defined in this class.
+     *
+     * @var int
+     */
+    public $type;
+
+    /**
+     * Content carried by this token.
+     *
+     * @var mixed
+     */
+    public $content;
+
+    /**
+     * Line in the source file on which the token was found.
+     *
+     * @var int
+     */
+    public $line;
+
+    /**
+     * Position of the token inside its line.
+     *
+     * @var int
+     */
+    public $position;
+
+    /**
+     * Create a new token from its type, content and location.
+     *
+     * The position is optional and defaults to 0.
+     *
+     * @ignore
+     * @param int $type
+     * @param mixed $content
+     * @param int $line
+     * @param int $position
+     * @return void
+     */
+    public function __construct( $type, $content, $line, $position = 0 )
+    {
+        // Location of the token in the source.
+        $this->line     = $line;
+        $this->position = $position;
+
+        // What kind of token this is and what it carries.
+        $this->type     = $type;
+        $this->content  = $content;
+    }
+}
+
+?>

Propchange: experimental/Document/src/document/rst/token.php
------------------------------------------------------------------------------
    svn:eol-style = native

Added: experimental/Document/src/document/rst/tokenizer.php
==============================================================================
--- experimental/Document/src/document/rst/tokenizer.php (added)
+++ experimental/Document/src/document/rst/tokenizer.php [iso-8859-1] Fri Feb  
8 12:35:13 2008
@@ -1,0 +1,114 @@
+<?php
+/**
+ * File containing the ezcDocumentRstTokenizer
+ *
+ * @package Document
+ * @version //autogen//
+ * @copyright Copyright (C) 2005-2008 eZ systems as. All rights reserved.
+ * @license http://ez.no/licenses/new_bsd New BSD License
+ */
+
+/**
+ * Tokenizer for RST documents
+ * 
+ * @package Document
+ * @version //autogen//
+ * @copyright Copyright (C) 2005-2008 eZ systems as. All rights reserved.
+ * @license http://ez.no/licenses/new_bsd New BSD License
+ */
+class ezcDocumentRstTokenizer
+{
+    /**
+     * Map of token types to the regular expression recognizing the token.
+     *
+     * Every expression is anchored to the start of the remaining input and
+     * exposes the token content in the named group "value". The expressions
+     * are tried in the order given here, the first one which matches wins.
+     *
+     * @var array
+     */
+    protected $tokens = array(
+        ezcDocumentRstToken::WHITESPACE =>
+            '(^(?P<value>[ \t]+))',
+        ezcDocumentRstToken::NEWLINE =>
+            '(^(?P<value>\r\n|\r|\n))',
+    );
+
+    /**
+     * Tokenize the given file
+     *
+     * Reads the contents of the passed file and hands them to
+     * tokenizeString(). Returns an array of ezcDocumentRstToken structs on
+     * success.
+     *
+     * @throws ezcBaseFileNotFoundException
+     *         If the file does not exist or is not readable.
+     * @throws ezcDocumentTokenizerException
+     *         If some part of the file could not be matched by any token.
+     *
+     * @param string $file
+     * @return array
+     */
+    public function tokenizeFile( $file )
+    {
+        if ( !file_exists( $file ) || !is_readable( $file ) )
+        {
+            // The exception has to be instantiated and thrown; calling it
+            // like a function results in a fatal error.
+            throw new ezcBaseFileNotFoundException( $file );
+        }
+
+        return $this->tokenizeString( file_get_contents( $file ) );
+    }
+
+    /**
+     * Tokenize the given string
+     *
+     * Consumes the passed string from its start, token by token, and
+     * returns an array of ezcDocumentRstToken structs on success. Lines and
+     * positions stored in the tokens both start counting at 1.
+     *
+     * @throws ezcDocumentTokenizerException
+     *         If the remaining input could not be matched by any token.
+     *
+     * @param string $string
+     * @return array
+     */
+    public function tokenizeString( $string )
+    {
+        $line = 1;
+        $position = 1;
+        $tokens = array();
+
+        while ( strlen( $string ) > 0 )
+        {
+            foreach ( $this->tokens as $token => $expression )
+            {
+                // A zero length match would never consume any input and
+                // cause an endless loop, so it is handled like no match.
+                if ( preg_match( $expression, $string, $matches ) &&
+                     ( ( $length = strlen( $matches[0] ) ) > 0 ) )
+                {
+                    // A token matched, so add the matched token to the token
+                    // list and update all variables.
+                    $tokens[] = new ezcDocumentRstToken(
+                        $token,
+                        ( isset( $matches['value'] ) ? $matches['value'] : null ),
+                        $line,
+                        $position
+                    );
+
+                    // Remove exactly the matched bytes from the input
+                    // string. The cast keeps the value a string when the
+                    // whole rest has been consumed.
+                    $string = (string) substr( $string, $length );
+                    $position += $length;
+
+                    // On a newline token reset the line position and
+                    // increase the line value
+                    if ( $token === ezcDocumentRstToken::NEWLINE )
+                    {
+                        ++$line;
+                        $position = 1;
+                    }
+
+                    // Restart matching with the first token definition on
+                    // the remaining input.
+                    continue 2;
+                }
+            }
+
+            // None of the token definitions matched the input string. We throw
+            // an exception with the position of the content in the input
+            // string and the contents we could not match.
+            throw new ezcDocumentTokenizerException(
+                $line,
+                $position,
+                $string
+            );
+        }
+
+        return $tokens;
+    }
+}
+
+?>

Propchange: experimental/Document/src/document/rst/tokenizer.php
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: experimental/Document/src/document_autoload.php
==============================================================================
--- experimental/Document/src/document_autoload.php [iso-8859-1] (original)
+++ experimental/Document/src/document_autoload.php [iso-8859-1] Fri Feb  8 
12:35:13 2008
@@ -33,6 +33,8 @@
     'ezcDocumentRelaxNgValidator'           => 
'Document/validator/realxng.php',
     'ezcDocumentRst'                        => 'Document/document/rst.php',
     'ezcDocumentRstOptions'                 => 
'Document/options/document_rst.php',
+    'ezcDocumentRstToken'                   => 
'Document/document/rst/token.php',
+    'ezcDocumentRstTokenizer'               => 
'Document/document/rst/tokenizer.php',
     'ezcDocumentXhtml'                      => 
'Document/document/xml/xhtml.php',
     'ezcDocumentXhtmlOptions'               => 
'Document/options/document_xhtml.php',
     'ezcDocumentXhtmlToDocbookConverter'    => 
'Document/converters/xslt/xhtml_docbook.php',

Added: experimental/Document/tests/document_rst_tokenizer_tests.php
==============================================================================
--- experimental/Document/tests/document_rst_tokenizer_tests.php (added)
+++ experimental/Document/tests/document_rst_tokenizer_tests.php [iso-8859-1] 
Fri Feb  8 12:35:13 2008
@@ -1,0 +1,70 @@
+<?php
+/**
+ * ezcDocumentRstTokenizerTests
+ * 
+ * @package Document
+ * @version //autogen//
+ * @subpackage Tests
+ * @copyright Copyright (C) 2005-2007 eZ systems as. All rights reserved.
+ * @license http://ez.no/licenses/new_bsd New BSD License
+ */
+
+/**
+ * Test suite for the ezcDocumentRstTokenizer class.
+ * 
+ * @package Document
+ * @subpackage Tests
+ */
+class ezcDocumentRstTokenizerTests extends ezcTestCase
+{
+    /**
+     * Cached list of test documents, null until it has been built once.
+     *
+     * @var array
+     */
+    protected static $testDocuments = null;
+
+    /**
+     * Return a test suite containing all tests of this class.
+     *
+     * @return PHPUnit_Framework_TestSuite
+     */
+    public static function suite()
+    {
+        return new PHPUnit_Framework_TestSuite( __CLASS__ );
+    }
+
+    /**
+     * Data provider listing the tokenizer test documents.
+     *
+     * Each entry consists of the path of a *.txt input file below
+     * files/rst/tokenizer/ and the path of the *.tokens file with the same
+     * base name, which is used as the expected result. The list is built on
+     * the first call and cached afterwards.
+     *
+     * @return array
+     */
+    public static function getTestDocuments()
+    {
+        if ( self::$testDocuments === null )
+        {
+            // Always end up with an array, so the data provider returns a
+            // valid, possibly empty, data set when no test files exist.
+            self::$testDocuments = array();
+
+            // Get a list of all test files from the respective folder
+            $testFiles = glob( dirname( __FILE__ ) . '/files/rst/tokenizer/*.txt' );
+            if ( $testFiles === false )
+            {
+                // glob() reports errors with false, which must not be
+                // iterated.
+                $testFiles = array();
+            }
+
+            // Create array with the test file and the expected result file
+            foreach ( $testFiles as $file )
+            {
+                self::$testDocuments[] = array(
+                    $file,
+                    substr( $file, 0, -3 ) . 'tokens'
+                );
+            }
+        }
+
+        return self::$testDocuments;
+    }
+
+    /**
+     * Tokenize an input file and compare the tokens with the expectation.
+     *
+     * The expected tokens are the return value of the included comparison
+     * file. The test is skipped if the comparison file does not exist.
+     *
+     * @dataProvider getTestDocuments
+     */
+    public function testLoadXmlDocumentFromFile( $from, $to )
+    {
+        if ( !is_file( $to ) )
+        {
+            $this->markTestSkipped( "Comparision file '$to' not yet defined." );
+        }
+
+        $tokenizer = new ezcDocumentRstTokenizer();
+        $tokens = $tokenizer->tokenizeFile( $from );
+
+        $expected = include $to;
+
+        $this->assertEquals(
+            $expected,
+            $tokens,
+            'Extracted tokens do not match expected tokens.'
+        );
+    }
+}
+
+?>

Propchange: experimental/Document/tests/document_rst_tokenizer_tests.php
------------------------------------------------------------------------------
    svn:eol-style = native

Added: experimental/Document/tests/files/rst/tokenizer/empty.tokens
==============================================================================
--- experimental/Document/tests/files/rst/tokenizer/empty.tokens (added)
+++ experimental/Document/tests/files/rst/tokenizer/empty.tokens [iso-8859-1] 
Fri Feb  8 12:35:13 2008
@@ -1,0 +1,3 @@
+<?php
+
+return array();

Modified: experimental/Document/tests/suite.php
==============================================================================
--- experimental/Document/tests/suite.php [iso-8859-1] (original)
+++ experimental/Document/tests/suite.php [iso-8859-1] Fri Feb  8 12:35:13 2008
@@ -19,6 +19,7 @@
 
 require_once 'document_options_xml_base_test.php';
 require_once 'document_xml_base_test.php';
+require_once 'document_rst_tokenizer_tests.php';
 
 require_once 'converter_options_ezp3_ezp4_test.php';
 require_once 'converter_ezp3_ezp4_test.php';
@@ -39,6 +40,7 @@
 
         $this->addTest( ezcDocumentOptionsXmlBaseTests::suite() );
         $this->addTest( ezcDocumentXmlBaseTests::suite() );
+        $this->addTest( ezcDocumentRstTokenizerTests::suite() );
 
         $this->addTest( ezcDocumentConverterOptionsEzp3ToEzp4Tests::suite() );
         $this->addTest( ezcDocumentConverterEzp3ToEzp4Tests::suite() );


-- 
svn-components mailing list
svn-components@lists.ez.no
http://lists.ez.no/mailman/listinfo/svn-components

Reply via email to