[tesseract-ocr] the unexpected result after trained data

Ali hussain Wed, 09 Aug 2023 10:39:25 -0700

I have fine-tuned Tesseract 5 for the Bengali language with some new fonts, 
using the official training text and the other official langdata files. 
Everything works, except for one problem: text in the default font, which 
the original model was already trained on, is no longer recognized as well 
as before, while my new fonts work well. I don't understand why this is 
happening. I am sharing my code below to help understand what is going on.



*Code for creating the .tif, .gt.txt, and .box files:*
import os
import random
import pathlib
import subprocess
import argparse
from FontList import FontList

def read_line_count():
    """Return the serial number stored in ``line_count.txt``.

    Returns:
        The integer stored in the file, or 0 when the file does not exist
        or contains only whitespace.

    Raises:
        ValueError: If the file holds text that is not an integer.
    """
    # Open directly instead of checking os.path.exists() first, so the file
    # cannot disappear between the check and the read.
    try:
        with open('line_count.txt', 'r') as file:
            content = file.read().strip()
    except FileNotFoundError:
        return 0
    # An empty file (e.g. left by an interrupted write) means "no count yet".
    return int(content) if content else 0

def write_line_count(line_count):
    """Store *line_count* in ``line_count.txt``, replacing any previous content."""
    serialized = str(line_count)
    with open('line_count.txt', mode='w') as counter_file:
        # end='' keeps the file free of a trailing newline.
        print(serialized, end='', file=counter_file)

def create_training_data(training_text_file, font_list, output_directory,
                         start_line=None, end_line=None):
    """Write a ground-truth file and run ``text2image`` per (font, line) pair.

    The non-empty lines of ``training_text_file`` are shuffled once. Then,
    for every font in ``font_list.fonts`` and every line, the line is
    written to ``<stem>_<serial>.gt.txt`` in ``output_directory`` and
    ``text2image`` is run with that file as ``--text`` and
    ``<output_directory>/<stem>_<serial>`` as ``--outputbase``. ``<stem>``
    is the training text file name without its extension.

    Args:
        training_text_file: Path to the text file with one sample per line.
        font_list: Object exposing an iterable ``fonts`` attribute of font
            names passed to ``text2image --font``.
        output_directory: Directory receiving the generated files; created
            (including parents) when missing.
        start_line: First serial number to use. When ``None``, the serial
            is resumed from ``read_line_count()``.
        end_line: Accepted for backward compatibility; it has never had any
            effect on which lines are rendered.
            NOTE(review): decide whether this should cap the serial numbers
            or slice the input lines, then implement it.

    Side effects:
        Persists the next unused serial number via ``write_line_count()``.
    """
    with open(training_text_file, 'r', encoding='utf-8') as input_file:
        stripped_lines = (raw_line.strip() for raw_line in input_file)
        # A blank line would only yield an empty ground-truth sample.
        lines = [text for text in stripped_lines if text]

    os.makedirs(output_directory, exist_ok=True)

    random.shuffle(lines)

    if start_line is None:
        line_count = read_line_count()  # Resume from the persisted serial
    else:
        line_count = start_line

    training_text_file_name = pathlib.Path(training_text_file).stem

    for font in font_list.fonts:
        for line in lines:
            # The serial keeps increasing across fonts, so every
            # (font, line) pair gets its own file name. The ground-truth
            # file and the text2image output share one base name; a
            # hard-coded 'ben' image base would only match the .gt.txt
            # name when the training text stem happens to be 'ben'.
            file_base_name = f'{training_text_file_name}_{line_count:d}'

            # GT (Ground Truth) text filename
            line_gt_text = os.path.join(
                output_directory, f'{file_base_name}.gt.txt')
            with open(line_gt_text, 'w', encoding='utf-8') as output_file:
                output_file.write(line)

            # The exit status is deliberately not checked: one failed
            # rendering does not abort the whole batch.
            subprocess.run([
                'text2image',
                f'--font={font}',
                f'--text={line_gt_text}',
                f'--outputbase={output_directory}/{file_base_name}',
                '--max_pages=1',
                '--strip_unrenderable_words',
                '--leading=36',
                '--xsize=3600',
                '--ysize=350',
                '--char_spacing=1.0',
                '--exposure=0',
                '--unicharset_file=langdata/ben.unicharset',
            ])

            line_count += 1

    write_line_count(line_count)  # Persist the next unused serial

if __name__ == "__main__":
    # Both options are optional; argparse leaves them as None when omitted,
    # and they are forwarded unchanged to create_training_data().
    parser = argparse.ArgumentParser()
    parser.add_argument('--start', type=int,
                        help='Starting line count (inclusive)')
    parser.add_argument('--end', type=int,
                        help='Ending line count (inclusive)')
    args = parser.parse_args()

    training_text_file = 'langdata/ben.training_text'
    output_directory = 'tesstrain/data/ben-ground-truth'

    # Create an instance of the FontList class
    font_list = FontList()

    create_training_data(training_text_file, font_list, output_directory,
                         args.start, args.end)


*And the training code:*

import subprocess

# Names passed to tesstrain as MODEL_NAME. Despite the variable name these
# are model names, not font names.
font_names = ['ben']

for font in font_names:
    # Adjacent string literals keep each source line short; the resulting
    # command is a single line. The leading VAR=value assignment sets
    # TESSDATA_PREFIX for the make process only, which is why the command
    # goes through the shell.
    command = (
        "TESSDATA_PREFIX=../tesseract/tessdata make training "
        f"MODEL_NAME={font} START_MODEL=ben TESSDATA=../tesseract/tessdata "
        "MAX_ITERATIONS=10000 LANG_TYPE=Indic"
    )
    subprocess.run(command, shell=True)


Any suggestions on how to identify the problem?
Thanks, everyone.







-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/6e894d47-02f3-41d4-bf41-7c9e63db656bn%40googlegroups.com.

[tesseract-ocr] the unexpected result after trained data

Reply via email to