My goal is to automate model training in tesseract OCR for Japanese words.
The user should just paste ground truth files and picture files into a
particular folder, and then use that data to train a new model. this
process should be able to be carried out multiple times. Every single time
data is added to the folder I expect an automated model training.
However, this is the error that i run into when I try to run automated
tesseract training on VSCode. What I did is that I had a script that uses
watchdog to detect newly added .tif/.png files alongside their
corresponding .gt.txt files into a particular folder (from which the model
is supposed to treat as training data and use it to train). The watcher
file looks something like this:
(watcher_trainng.py)
import time
import os
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from pathlib import Path
from training.tesseract_training import run_tesseract_training
from training.training_model_utils import get_latest_and_next_model
WATCHED_FOLDER = r"C:\Users\Chan Jian Sen\Documents\ocr-japanese\I
NPUT_TRAINING_DATA" #ground truth put here
tesstrain_dir = r"C:\Users\Chan Jian Sen\Documents\TesseractFineTuningJpn5\t
esstrain"
class TrainingInputHandler(FileSystemEventHandler):
def on_modified(self, event):
self.check_and_trigger_training()
def on_created(self, event):
self.check_and_trigger_training()
def check_and_trigger_training(self):
files = os.listdir(WATCHED_FOLDER)
pngs = {Path(f).stem for f in files if f.endswith('.png')}
gts = {Path(f).stem for f in files if f.endswith('.gt.txt')}
common = pngs & gts
if len(common) == 0:
print("⏳ Waiting for matching .png and .gt.txt pairs...")
tessdata_path = r"C:\Users\Chan Jian Sen\Documents\T
esseractFineTuningJpn5\tessdata"
start_model, new_model = get_latest_and_next_model(tessdata_path)
print(f"🔁 Using {start_model} as base, training new model: {
new_model}") #problem here is the the old model they saw it as jpn and the
new model as jpn1
run_tesseract_training(tesstrain_dir, new_model, start_model) #the
first parameter MUST be your tesstrain folder
observer.stop()
if __name__ == "__main__":
print(f"👀 Watching training data folder: {WATCHED_FOLDER}")
event_handler = TrainingInputHandler()
observer = Observer()
observer.schedule(event_handler, WATCHED_FOLDER, recursive=False)
observer.start()
try:
while observer.is_alive():
time.sleep(1)
except KeyboardInterrupt:
observer.stop()
observer.join()
To generate a new model name (since I want to automate model training), i
also have these functions here:
(training_model_utils.py)
import os
def get_model_names(tessdata_path, model_prefix="jpn"):
models = []
for fname in os.listdir(tessdata_path):
if fname.startswith(model_prefix) and fname.endswith(".traineddata"
):
suffix = fname[len(model_prefix):-len(".traineddata")]
if suffix == "":
models.append((0, "jpn"))
elif suffix.isdigit():
models.append((int(suffix), f"{model_prefix}{suffix}"))
models.sort()
return models
def get_latest_and_next_model(tessdata_path, model_prefix="jpn"):
models = get_model_names(tessdata_path, model_prefix)
if not models:
return model_prefix, f"{model_prefix}2"
latest = models[-1][1]
next_num = models[-1][0] + 1
next_model = f"{model_prefix}{next_num}" if next_num > 0 else f"{
model_prefix}2"
return latest, next_model
I also coded the make training procedure into VSCode, with a python script
that calls for it. This code snippet below is meant to run the tesseract
training.
(tesseract_training.py)
import subprocess
import os
def run_tesseract_training(training_dir, model_name, start_model,
max_iterations=4000): #previously start model is jpn
"""
Run the full Tesseract tesstrain workflow including unicharset and
langdata.
"""
tessdata_path = r"C:\Users\Chan Jian Sen\Documents\T
esseractFineTuningJpn5\tessdata"
# Important: replace backslashes with forward slashes
tessdata_path = tessdata_path.replace("\\", "/")
env = os.environ.copy()
env["TESSDATA_PREFIX"] = tessdata_path
command = [
"make",
"unicharset", "lists", "proto-model", "tesseract-langdata",
"training",
f"MODEL_NAME={model_name}",
f"START_MODEL={start_model}",
f"TESSDATA={tessdata_path}", # Adjust path depending on where your
.traineddata are
f"GROUND_TRUTH_DIR={training_dir}",
f"MAX_ITERATIONS={max_iterations}",
"LEARNING_RATE=0.001"
]
print("🚀 Running full Tesseract training pipeline...")
try:
subprocess.run(command, cwd=r"C:\Users\Chan Jian Sen\Documents\T
esseractFineTuningJpn5\tesstrain", shell=True, check=True, env=env)
print(f"✅ Training complete: {model_name}.traineddata generated.")
except subprocess.CalledProcessError as e:
print(f"❌ Training failed: {e}")
However this is my terminal output when I run the watcher file.
PS C:\Users\Chan Jian Sen\Documents\ocr-japanese> c:; cd 'c:\Users\Chan
Jian Sen\Documents\ocr-japanese'; & 'c:\Users\Chan Jian
Sen\AppData\Local\Programs\Python\Python39\python.exe' 'c:\Users\Chan Jian
Sen\.vscode\extensions\ms-python.debugpy-2025.6.0-win32-x64\bundled\libs\debugpy\launcher'
'56444' '--' 'C:\Users\Chan Jian
Sen\Documents\ocr-japanese\watcher_training.py'
gpy-2025.6.0-win32-x64\x5cbundled\x5clibs\x5cdebugpy\x5clauncher' '56444'
'--' 'C:\x5cUsers\x5cChan Jian
Sen\x5cDocuments\x5cocr-japanese\x5cwatcher_training.py'
;0a5d0c8e-f6f4-44db-b1ea-a49791670afe👀 Watching training data folder:
C:\Users\Chan Jian Sen\Documents\ocr-japanese\INPUT_TRAINING_DATA
⏳ Waiting for matching .png and .gt.txt pairs...
🔁 Using jpn as base, training new model: jpn1
🚀 Running full Tesseract training pipeline...
You are using make version: 4.4.1
Makefile:438: *** mixed implicit and normal rules: deprecated syntax
combine_tessdata -u C:/Users/Chan Jian
Sen/Documents/TesseractFineTuningJpn5/tessdata/jpn.traineddata data/jpn/jpn1
Failed to read C:/Users/Chan
make: *** [Makefile:207: data/jpn/jpn1.lstm-unicharset] Error 1
Would greatly appreciate for any help given. Sorry if it's quite a lot to
digest.
--
You received this message because you are subscribed to the Google Groups
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion visit
https://groups.google.com/d/msgid/tesseract-ocr/a29346d2-7d51-41e8-8a44-6de99c714b3fn%40googlegroups.com.