One more thing: I used langdata_best.
On Wednesday, 9 August, 2023 at 11:39:21 pm UTC+6 Ali hussain wrote:
> I have fine-tuned Tesseract 5 on some new fonts for the Bengali
> language, using the official training text and the other official
> data. Everything works, except that text in the default font, which the
> model was originally trained on, is no longer recognized as well as
> before, while my new fonts work well. I don't understand why this is
> happening. I am sharing my code below to help show what is going on.
>
>
> *Code for creating the .tif, .gt.txt, and .box files:*
> import os
> import random
> import pathlib
> import subprocess
> import argparse
> from FontList import FontList
>
> def read_line_count():
> if os.path.exists('line_count.txt'):
> with open('line_count.txt', 'r') as file:
> return int(file.read())
> return 0
>
> def write_line_count(line_count):
> with open('line_count.txt', 'w') as file:
> file.write(str(line_count))
>
> def create_training_data(training_text_file, font_list, output_directory,
> start_line=None, end_line=None):
> lines = []
> with open(training_text_file, 'r') as input_file:
> for line in input_file.readlines():
> lines.append(line.strip())
>
> if not os.path.exists(output_directory):
> os.mkdir(output_directory)
>
> random.shuffle(lines)
>
> if start_line is None:
> line_count = read_line_count() # Set the starting line_count
> from the file
> else:
> line_count = start_line
>
> if end_line is None:
> end_line_count = len(lines) - 1 # Set the ending line_count
> else:
> end_line_count = min(end_line, len(lines) - 1)
>
> for font in font_list.fonts: # Iterate through all the fonts in the
> font_list
> font_serial = 1
> for line in lines:
> training_text_file_name = pathlib.Path(training_text_file
> ).stem
>
> # Generate a unique serial number for each line
> line_serial = f"{line_count:d}"
>
> # GT (Ground Truth) text filename
> line_gt_text = os.path.join(output_directory, f'{
> training_text_file_name}_{line_serial}.gt.txt')
> with open(line_gt_text, 'w') as output_file:
> output_file.writelines([line])
>
> # Image filename
> file_base_name = f'ben_{line_serial}' # Unique filename for
> each font
> subprocess.run([
> 'text2image',
> f'--font={font}',
> f'--text={line_gt_text}',
> f'--outputbase={output_directory}/{file_base_name}',
> '--max_pages=1',
> '--strip_unrenderable_words',
> '--leading=36',
> '--xsize=3600',
> '--ysize=350',
> '--char_spacing=1.0',
> '--exposure=0',
> '--unicharset_file=langdata/ben.unicharset',
> ])
>
> line_count += 1
> font_serial += 1
>
> # Reset font_serial for the next font iteration
> font_serial = 1
>
> write_line_count(line_count) # Update the line_count in the file
>
> if __name__ == "__main__":
> parser = argparse.ArgumentParser()
> parser.add_argument('--start', type=int, help='Starting line count
> (inclusive)')
> parser.add_argument('--end', type=int, help='Ending line count
> (inclusive)')
> args = parser.parse_args()
>
> training_text_file = 'langdata/ben.training_text'
> output_directory = 'tesstrain/data/ben-ground-truth'
>
> # Create an instance of the FontList class
> font_list = FontList()
>
> create_training_data(training_text_file, font_list, output_directory,
> args.start, args.end)
>
>
> *And the training code:*
>
> import subprocess
>
> # List of font names
> font_names = ['ben']
>
> for font in font_names:
> command = f"TESSDATA_PREFIX=../tesseract/tessdata make training
> MODEL_NAME={font} START_MODEL=ben TESSDATA=../tesseract/tessdata
> MAX_ITERATIONS=10000 LANG_TYPE=Indic"
> subprocess.run(command, shell=True)
>
>
> Any suggestions for identifying the problem?
> Thanks, everyone.
>
>
>
>
>
>
>
>
--
You received this message because you are subscribed to the Google Groups
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/tesseract-ocr/a8422cae-bfd9-4e29-b8d5-3a9ac9fe623fn%40googlegroups.com.