I have fine-tuned Tesseract 5 for the Bengali language with some new fonts,
using the official training_text and the other official files. Everything
works, except for one problem: text in the default fonts, which the original
model was trained on, is no longer converted as well as before, while my new
fonts work well. I don't understand why this is happening. I am sharing my
code below to help explain what is going on.
*Code for creating the .tif, .gt.txt, and .box files:*
import os
import random
import pathlib
import subprocess
import argparse
from FontList import FontList
def read_line_count():
    """Return the sample counter persisted in ``line_count.txt``.

    Returns:
        int: the stored counter, or 0 when the file does not exist or
        contains only whitespace.

    Raises:
        ValueError: if the file holds non-blank text that is not an integer.
    """
    if not os.path.exists('line_count.txt'):
        return 0
    with open('line_count.txt', 'r') as file:
        content = file.read().strip()
    # A blank file carries no counter; treat it like a missing file instead
    # of crashing in int('').
    return int(content) if content else 0
def write_line_count(line_count):
    """Store *line_count* in ``line_count.txt``, replacing any previous content."""
    counter_path = pathlib.Path('line_count.txt')
    counter_path.write_text(str(line_count))
def create_training_data(training_text_file, font_list, output_directory,
                         start_line=None, end_line=None):
    """Write ground-truth files and run ``text2image`` for each line and font.

    The training text is read, blank lines are dropped, and the remaining
    lines are shuffled. For every font in ``font_list.fonts`` and every
    selected line, a ``.gt.txt`` file is written into *output_directory* and
    ``text2image`` is invoked with ``--outputbase`` set to ``ben_<serial>``
    in that directory. The serial keeps increasing across fonts, and its
    final value is saved with ``write_line_count`` at the end.

    Args:
        training_text_file: Path of the text file holding one sample per
            line; read as UTF-8.
        font_list: Object whose ``fonts`` attribute is iterated; each item is
            passed to ``text2image`` as ``--font=<item>``.
        output_directory: Directory that receives the generated files; it is
            created (including missing parents) when absent.
        start_line: Serial number assigned to the first generated sample.
            When ``None`` the value returned by ``read_line_count()`` is
            used. It only seeds the numbering; it does not skip input lines.
        end_line: Inclusive index, in the shuffled order, of the last line to
            render for each font. Clamped to the last available line;
            ``None`` renders every line.
    """
    # The text is Bengali: use UTF-8 explicitly rather than the platform
    # default encoding.
    with open(training_text_file, 'r', encoding='utf-8') as input_file:
        stripped_lines = (raw_line.strip() for raw_line in input_file)
        # A blank line would yield an empty ground-truth file with nothing
        # to render, so it is skipped.
        lines = [text for text in stripped_lines if text]

    # makedirs also creates missing parent directories, where os.mkdir fails.
    os.makedirs(output_directory, exist_ok=True)

    random.shuffle(lines)

    # Starting serial: explicit value, or the counter saved by an earlier run.
    line_count = read_line_count() if start_line is None else start_line

    if end_line is None:
        end_line_count = len(lines) - 1
    else:
        end_line_count = min(end_line, len(lines) - 1)
    # Apply the upper bound (it used to be computed but never used). max()
    # stops a negative end_line from becoming a from-the-end slice.
    selected_lines = lines[:max(end_line_count + 1, 0)]

    # Loop invariant: stem of the training text file name.
    training_text_file_name = pathlib.Path(training_text_file).stem

    for font in font_list.fonts:
        for line in selected_lines:
            # Unique serial number for this (font, line) sample.
            line_serial = f"{line_count:d}"

            # GT (Ground Truth) text filename.
            # NOTE(review): the ground-truth name uses the training file's
            # stem while the image base below is hard-coded to 'ben_'; the
            # two only share a base name when that stem is 'ben'.
            line_gt_text = os.path.join(
                output_directory,
                f'{training_text_file_name}_{line_serial}.gt.txt',
            )
            with open(line_gt_text, 'w', encoding='utf-8') as output_file:
                output_file.write(line)

            # Output base name passed to text2image.
            file_base_name = f'ben_{line_serial}'

            result = subprocess.run([
                'text2image',
                f'--font={font}',
                f'--text={line_gt_text}',
                f'--outputbase={output_directory}/{file_base_name}',
                '--max_pages=1',
                '--strip_unrenderable_words',
                '--leading=36',
                '--xsize=3600',
                '--ysize=350',
                '--char_spacing=1.0',
                '--exposure=0',
                '--unicharset_file=langdata/ben.unicharset',
            ])
            if result.returncode != 0:
                # Report the failure instead of ignoring it; generation
                # continues with the remaining samples as before.
                print(f'text2image failed (exit code {result.returncode}) '
                      f'for {line_gt_text} with font {font}')

            line_count += 1

    write_line_count(line_count)  # Persist the counter for the next run.
if __name__ == "__main__":
    # Fixed input and output locations used by this script.
    training_text_file = 'langdata/ben.training_text'
    output_directory = 'tesstrain/data/ben-ground-truth'

    # Optional integer bounds, forwarded unchanged to create_training_data().
    cli = argparse.ArgumentParser()
    for flag, description in (
        ('--start', 'Starting line count (inclusive)'),
        ('--end', 'Ending line count (inclusive)'),
    ):
        cli.add_argument(flag, type=int, help=description)
    options = cli.parse_args()

    # A fresh FontList instance supplies the fonts to render.
    create_training_data(
        training_text_file,
        FontList(),
        output_directory,
        options.start,
        options.end,
    )
*And the training code:*
import os
import subprocess

# Model names to train; one `make training` run is started per entry.
font_names = ['ben']

# TESSDATA_PREFIX is supplied through the child environment instead of a
# shell-style `VAR=value command` prefix, so no shell is needed.
training_env = {**os.environ, 'TESSDATA_PREFIX': '../tesseract/tessdata'}

for font in font_names:
    # Argument list rather than an interpolated shell string: the model name
    # reaches make verbatim, with no shell word-splitting or expansion.
    command = [
        'make',
        'training',
        f'MODEL_NAME={font}',
        'START_MODEL=ben',
        'TESSDATA=../tesseract/tessdata',
        'MAX_ITERATIONS=10000',
        'LANG_TYPE=Indic',
    ]
    result = subprocess.run(command, env=training_env)
    if result.returncode != 0:
        # Report the failure instead of ignoring it; the loop still moves on
        # to the next entry, as before.
        print(f'make training failed for MODEL_NAME={font} '
              f'(exit code {result.returncode})')
Any suggestions on how to identify and fix the problem would be appreciated.
Thanks, everyone.
--
You received this message because you are subscribed to the Google Groups
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/tesseract-ocr/6e894d47-02f3-41d4-bf41-7c9e63db656bn%40googlegroups.com.