Author: kn Date: Fri Feb 8 14:11:39 2008 New Revision: 7323 Log: - Added tokenizer exception - Test for nonexistent files - Successfully tokenize titles
Added: experimental/Document/src/exceptions/rst_tokenizer.php (with props) experimental/Document/tests/files/rst/tokenizer/titles.tokens Modified: experimental/Document/design/class_diagram.png experimental/Document/src/document/rst/token.php experimental/Document/src/document/rst/tokenizer.php experimental/Document/src/document_autoload.php experimental/Document/tests/document_rst_tokenizer_tests.php experimental/Document/tests/files/rst/tokenizer/ (props changed) Modified: experimental/Document/design/class_diagram.png ============================================================================== Binary files - no diff available. Modified: experimental/Document/src/document/rst/token.php ============================================================================== --- experimental/Document/src/document/rst/token.php [iso-8859-1] (original) +++ experimental/Document/src/document/rst/token.php [iso-8859-1] Fri Feb 8 14:11:39 2008 @@ -21,6 +21,10 @@ // Whitespace tokens const WHITESPACE = 1; const NEWLINE = 2; + + const HEADLINE = 11; + + const TEXT_LINE = 99; /** * Token type @@ -67,6 +71,23 @@ $this->line = $line; $this->position = $position; } + + /** + * Set state after var_export + * + * @param array $properties + * @return void + * @ignore + */ + public static function __set_state( $properties ) + { + return new ezcDocumentRstToken( + $properties['type'], + $properties['content'], + $properties['line'], + $properties['position'] + ); + } } ?> Modified: experimental/Document/src/document/rst/tokenizer.php ============================================================================== --- experimental/Document/src/document/rst/tokenizer.php [iso-8859-1] (original) +++ experimental/Document/src/document/rst/tokenizer.php [iso-8859-1] Fri Feb 8 14:11:39 2008 @@ -19,18 +19,75 @@ class ezcDocumentRstTokenizer { /** + * Allowed character sets for headlines. 
+ * + * @see http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#sections + */ + const HEADLINE_CHARS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'; + + /** + * Allowed character sets for table lines. + * + * @see http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#sections + */ + const TABLE_CHARS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ '; + + /** + * Characters to start bullet lists. Prepared for inclusion in regular + * expression character groups. + * + * @see http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#bullet-lists + */ + const BULLET_LIST_CHARS = '*+\\x{e280a2}\\x{e280a3}\\x{e28183}-'; + + /** + * Characters to start enumerated lists. Prepared for inclusion in regular + * expressions. + * + * @see http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#enumerated-lists + */ + const ENUM_LIST_CHARS = '(?P<enum>\\d+|[A-Z]|[a-z]|[IVXLCDM]+|[ivxlcdm]+|#)'; + + /** + * Characters ending a pure text section. + * + * @see http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#enumerated-lists + */ + const TEXT_END_CHARS = '`\'\\s'; + + /** * List with tokens and a regular expression matching the given token. * * The tokens are matched in the given order. * * @var array */ - protected $tokens = array( - ezcDocumentRstToken::WHITESPACE => - '(^(?P<value>[ \t]+))', - ezcDocumentRstToken::NEWLINE => - '(^(?P<value>\r\n|\r|\n))', - ); + protected $tokens = array(); + + /** + * Construct tokenizer + * + * Create token array with regular expression matching the respective + * token. + * + * @return void + */ + public function __construct() + { + $this->tokens = array( + ezcDocumentRstToken::WHITESPACE => + '(\\A(?P<value>[ \\t]+))', + ezcDocumentRstToken::NEWLINE => + '(\\A(?P<value>\\r\\n|\\r|\\n))', + + ezcDocumentRstToken::HEADLINE => + '(\\A(?P<value>([' . self::HEADLINE_CHARS . '])\\2*)$)m', + + // This should be last match + ezcDocumentRstToken::TEXT_LINE => + '(\\A(?P<value>[^' . 
self::TEXT_END_CHARS . ']*))', + ); + } /** * Tokenize the given file @@ -47,10 +104,41 @@ { if ( !file_exists( $file ) || !is_readable( $file ) ) { - ezcBaseFileNotFoundException( $file ); + throw new ezcBaseFileNotFoundException( $file ); } return $this->tokenizeString( file_get_contents( $file ) ); + } + + protected function dumpTokenizerState( $line, $position, $tokens, $string ) + { + $lastToken = end( $tokens ); + + // Get last 5 token in reverse order + $tokens = array_reverse( $tokens ); + $lastTokensString = ''; + foreach ( array_splice( $tokens, 0, 5 ) as $token ) + { + $lastTokensString .= $token->type . ', '; + } + + printf( + "\nAt line %d char %d in string '%s'.\n", + $line, $position, substr( $string, 0, 20 ) + ); + + if ( $lastToken ) + { + printf( + "- Last token read: (%d, '%s', %d:%d)\n", + $lastToken->type, $lastToken->content, $lastToken->line, $lastToken->position + ); + } + + printf( + "- Last tokens in list: %s\n", + $lastTokensString + ); } /** @@ -74,6 +162,8 @@ { foreach ( $this->tokens as $token => $expression ) { +// $this->dumpTokenizerState( $line, $position, $tokens, $string ); + if ( preg_match( $expression, $string, $matches ) ) { // A token matched, so add the matched token to the token @@ -86,21 +176,26 @@ ); // Removed matched stuff from input string - $string = substr( $string, $position += strlen( $matches[0] ) ); + $string = substr( $string, $length = strlen( $matches[0] ) ); + $position += $length; // On a newline token reset the line position and increase the line value - if ( $token === ezcDocumentRstToken::NEWLINE ) + if ( preg_match( '(\r\n|\r|\n)', $matches[0] ) ) { ++$line; $position = 1; } + + // Restart the while loop, because we matched a token and + // can retry with shortened string. + continue 2; } } // None of the token definitions matched the input string. We throw // an exception with the position of the content in the input // string and the contents we could not match. 
- throw new ezcDocumentTokenizerException( + throw new ezcDocumentRstTokenizerException( $line, $position, $string Modified: experimental/Document/src/document_autoload.php ============================================================================== --- experimental/Document/src/document_autoload.php [iso-8859-1] (original) +++ experimental/Document/src/document_autoload.php [iso-8859-1] Fri Feb 8 14:11:39 2008 @@ -12,6 +12,7 @@ return array( 'ezcDocumentException' => 'Document/exceptions/exception.php', 'ezcDocumentErrnousXmlException' => 'Document/exceptions/errnous_xml.php', + 'ezcDocumentRstTokenizerException' => 'Document/exceptions/rst_tokenizer.php', 'ezcDocument' => 'Document/interfaces/document.php', 'ezcDocumentBaseOptions' => 'Document/options/document_base.php', 'ezcDocumentConverter' => 'Document/interfaces/converter.php', Added: experimental/Document/src/exceptions/rst_tokenizer.php ============================================================================== --- experimental/Document/src/exceptions/rst_tokenizer.php (added) +++ experimental/Document/src/exceptions/rst_tokenizer.php [iso-8859-1] Fri Feb 8 14:11:39 2008 @@ -1,0 +1,34 @@ +<?php +/** + * Base exception for the Document package. + * + * @package Document + * @version //autogentag// + * @copyright Copyright (C) 2005-2008 eZ systems as. All rights reserved. + * @license http://ez.no/licenses/new_bsd New BSD License + */ + +/** + * Exception thrown, when the RST tokenizer could not tokenize a character + * sequence. + * + * @package Document + * @version //autogentag// + */ +class ezcDocumentRstTokenizerException extends ezcDocumentException +{ + /** + * Construct exception from array with XML errors + * + * @param array $errors + * @return void + */ + public function __construct( $line, $position, $string ) + { + parent::__construct( + "Could not tokenize string at line $line char $position: '" . substr( $string, 0, 10 ) . "'." 
+ ); + } +} + +?> Propchange: experimental/Document/src/exceptions/rst_tokenizer.php ------------------------------------------------------------------------------ svn:eol-style = native Modified: experimental/Document/tests/document_rst_tokenizer_tests.php ============================================================================== --- experimental/Document/tests/document_rst_tokenizer_tests.php [iso-8859-1] (original) +++ experimental/Document/tests/document_rst_tokenizer_tests.php [iso-8859-1] Fri Feb 8 14:11:39 2008 @@ -59,12 +59,28 @@ $expected = include $to; + file_put_contents( $to . '.test', "<?php\n\nreturn " . var_export( $tokens, true ) . ";\n\n" ); + $this->assertEquals( $expected, $tokens, 'Extracted tokens do not match expected tokens.' ); } + + public function testNotExistantFile() + { + try + { + $tokenizer = new ezcDocumentRstTokenizer(); + $tokens = $tokenizer->tokenizeFile( + dirname( __FILE__ ) . '/files/rst/tokenizer/not_existant_file.txt' + ); + $this->fail( 'Expected ezcBaseFileNotFoundException.' 
); + } + catch ( ezcBaseFileNotFoundException $e ) + { /* Expected */ } + } } ?> Propchange: experimental/Document/tests/files/rst/tokenizer/ ------------------------------------------------------------------------------ svn:ingore = *.test Added: experimental/Document/tests/files/rst/tokenizer/titles.tokens ============================================================================== --- experimental/Document/tests/files/rst/tokenizer/titles.tokens (added) +++ experimental/Document/tests/files/rst/tokenizer/titles.tokens [iso-8859-1] Fri Feb 8 14:11:39 2008 @@ -1,0 +1,613 @@ +<?php + +return array ( + 0 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '===============', + 'line' => 1, + 'position' => 1, + )), + 1 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 1, + 'position' => 16, + )), + 2 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 2, + 'position' => 1, + )), + 3 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 2, + 'position' => 2, + )), + 4 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 2, + 'position' => 9, + )), + 5 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 2, + 'position' => 10, + )), + 6 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 2, + 'position' => 15, + )), + 7 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '===============', + 'line' => 3, + 'position' => 1, + )), + 8 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 3, + 'position' => 16, + )), + 9 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 4, + 'position' => 1, + )), + 10 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '---------------', 
+ 'line' => 5, + 'position' => 1, + )), + 11 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 5, + 'position' => 16, + )), + 12 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 6, + 'position' => 1, + )), + 13 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 6, + 'position' => 2, + )), + 14 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 6, + 'position' => 9, + )), + 15 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 6, + 'position' => 10, + )), + 16 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 6, + 'position' => 15, + )), + 17 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '---------------', + 'line' => 7, + 'position' => 1, + )), + 18 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 7, + 'position' => 16, + )), + 19 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 8, + 'position' => 1, + )), + 20 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 9, + 'position' => 1, + )), + 21 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 9, + 'position' => 8, + )), + 22 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 9, + 'position' => 9, + )), + 23 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 9, + 'position' => 14, + )), + 24 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '=============', + 'line' => 10, + 'position' => 1, + )), + 25 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 10, + 'position' => 14, + )), + 26 => + 
ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 11, + 'position' => 1, + )), + 27 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 12, + 'position' => 1, + )), + 28 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 12, + 'position' => 8, + )), + 29 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 12, + 'position' => 9, + )), + 30 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 12, + 'position' => 14, + )), + 31 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '-------------', + 'line' => 13, + 'position' => 1, + )), + 32 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 13, + 'position' => 14, + )), + 33 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 14, + 'position' => 1, + )), + 34 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 15, + 'position' => 1, + )), + 35 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 15, + 'position' => 8, + )), + 36 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 15, + 'position' => 9, + )), + 37 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 15, + 'position' => 14, + )), + 38 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '`````````````', + 'line' => 16, + 'position' => 1, + )), + 39 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 16, + 'position' => 14, + )), + 40 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 17, + 'position' => 1, + )), + 41 => + ezcDocumentRstToken::__set_state(array( + 'type' 
=> 99, + 'content' => 'Section', + 'line' => 18, + 'position' => 1, + )), + 42 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 18, + 'position' => 8, + )), + 43 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 18, + 'position' => 9, + )), + 44 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 18, + 'position' => 14, + )), + 45 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '\'\'\'\'\'\'\'\'\'\'\'\'\'', + 'line' => 19, + 'position' => 1, + )), + 46 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 19, + 'position' => 14, + )), + 47 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 20, + 'position' => 1, + )), + 48 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 21, + 'position' => 1, + )), + 49 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 21, + 'position' => 8, + )), + 50 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 21, + 'position' => 9, + )), + 51 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 21, + 'position' => 14, + )), + 52 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '.............', + 'line' => 22, + 'position' => 1, + )), + 53 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 22, + 'position' => 14, + )), + 54 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 23, + 'position' => 1, + )), + 55 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 24, + 'position' => 1, + )), + 56 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 
'line' => 24, + 'position' => 8, + )), + 57 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 24, + 'position' => 9, + )), + 58 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 24, + 'position' => 14, + )), + 59 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '~~~~~~~~~~~~~', + 'line' => 25, + 'position' => 1, + )), + 60 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 25, + 'position' => 14, + )), + 61 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 26, + 'position' => 1, + )), + 62 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 27, + 'position' => 1, + )), + 63 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 27, + 'position' => 8, + )), + 64 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 27, + 'position' => 9, + )), + 65 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 27, + 'position' => 14, + )), + 66 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '*************', + 'line' => 28, + 'position' => 1, + )), + 67 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 28, + 'position' => 14, + )), + 68 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 29, + 'position' => 1, + )), + 69 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 30, + 'position' => 1, + )), + 70 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 30, + 'position' => 8, + )), + 71 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 30, + 'position' => 9, + )), + 72 
=> + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 30, + 'position' => 14, + )), + 73 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '+++++++++++++', + 'line' => 31, + 'position' => 1, + )), + 74 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 31, + 'position' => 14, + )), + 75 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 32, + 'position' => 1, + )), + 76 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Section', + 'line' => 33, + 'position' => 1, + )), + 77 => + ezcDocumentRstToken::__set_state(array( + 'type' => 1, + 'content' => ' ', + 'line' => 33, + 'position' => 8, + )), + 78 => + ezcDocumentRstToken::__set_state(array( + 'type' => 99, + 'content' => 'Title', + 'line' => 33, + 'position' => 9, + )), + 79 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 33, + 'position' => 14, + )), + 80 => + ezcDocumentRstToken::__set_state(array( + 'type' => 11, + 'content' => '^^^^^^^^^^^^^', + 'line' => 34, + 'position' => 1, + )), + 81 => + ezcDocumentRstToken::__set_state(array( + 'type' => 2, + 'content' => ' +', + 'line' => 34, + 'position' => 14, + )), +); + -- svn-components mailing list svn-components@lists.ez.no http://lists.ez.no/mailman/listinfo/svn-components