diff --git a/lib/Gscan2pdf/Page.pm b/lib/Gscan2pdf/Page.pm
index 0fb2296..a397880 100644
--- a/lib/Gscan2pdf/Page.pm
+++ b/lib/Gscan2pdf/Page.pm
@@ -111,17 +111,27 @@ sub thaw {
 sub boxes {
  my ( $self, @boxes ) = @_;
 
+ # Unfortunately, there seems to be a case (tested in t/31_ocropus_utf8.t)
+ # where decode_entities doesn't work cleanly, so encode/decode to finally
+ # get good UTF-8
+ $self->{hocr} =
+   decode_utf8(
+  encode_utf8( HTML::Entities::decode_entities( $self->{hocr} ) ) );
+
  if ( $self->{hocr} =~ /<body>([\s\S]*)<\/body>/x ) {
   my $p = HTML::TokeParser->new( \$self->{hocr} );
   my ( $x1, $y1, $x2, $y2, $text );
   while ( my $token = $p->get_token ) {
    if ( $token->[0] eq 'S' ) {
-    if ( $token->[1] eq 'span'
+    if (
+         $token->[1] eq 'span'
      and defined( $token->[2]{class} )
-     and
-     ( $token->[2]{class} eq 'ocr_line' or $token->[2]{class} eq 'ocr_word' )
+     and ( $token->[2]{class} eq 'ocr_line'
+      or $token->[2]{class} eq 'ocr_word'
+      or $token->[2]{class} eq 'ocrx_word' )
      and defined( $token->[2]{title} )
-     and $token->[2]{title} =~ /bbox\ (\d+)\ (\d+)\ (\d+)\ (\d+)/x )
+     and $token->[2]{title} =~ /bbox\ (\d+)\ (\d+)\ (\d+)\ (\d+)/x
+      )
     {
      ( $x1, $y1, $x2, $y2 ) = ( $1, $2, $3, $4 );
     }
@@ -134,13 +144,7 @@ sub boxes {
     }
    }
    if ( $token->[0] eq 'T' and $token->[1] !~ /^\s*$/x ) {
-
-    # Unfortunately, there seems to be a case (tested in t/31_ocropus_utf8.t)
-    # where decode_entities doesn't work cleanly, so encode/decode to finally
-    # get good UTF-8
-    $text =
-      decode_utf8(
-     encode_utf8( HTML::Entities::decode_entities( $token->[1] ) ) );
+    $text = $token->[1];
     chomp($text);
    }
    if ( $token->[0] eq 'E' ) {