Tesseract Version: 4.1.0
I am trying to fine tune tesseract on custom dataset with the following Makefile: export SHELL := /bin/bash HOME := $(PWD) TESSDATA = $(HOME)/tessdata LANGDATA = $(HOME)/langdata # Train directory # TRAIN := $(HOME)/train_data TRAIN := /media/vimaan/Data/OCR/tesseract_train # Name of the model to be built MODEL_NAME = eng LANG_CODE = eng # Name of the model to continue from CONTINUE_FROM = eng TESSDATA_REPO = _best # Normalization Mode - see src/training/language_specific.sh for details NORM_MODE = 1 # BEGIN-EVAL makefile-parser --make-help Makefile help: @echo "" @echo " Targets" @echo "" @echo " unicharset Create unicharset" @echo " lists Create lists of lstmf filenames for training and eval" @echo " training Start training" @echo " proto-model Build the proto model" @echo " leptonica Build leptonica" @echo " tesseract Build tesseract" @echo " tesseract-langs Download tesseract-langs" @echo " langdata Download langdata" @echo " clean Clean all generated files" @echo "" @echo " Variables" @echo "" @echo " MODEL_NAME Name of the model to be built" @echo " CORES No of cores to use for compiling leptonica/tesseract" @echo " LEPTONICA_VERSION Leptonica version. Default: $(LEPTONICA_VERSION)" @echo " TESSERACT_VERSION Tesseract commit. Default: $(TESSERACT_VERSION)" @echo " LANGDATA_VERSION Tesseract langdata version. Default: $(LANGDATA_VERSION)" @echo " TESSDATA_REPO Tesseract model repo to use. 
Default: $(TESSDATA_REPO)" @echo " TRAIN Train directory" @echo " RATIO_TRAIN Ratio of train / eval training data" # END-EVAL # Ratio of train / eval training data RATIO_TRAIN := 0.90 ALL_BOXES = data/all-boxes ALL_LSTMF = data/all-lstmf # Create unicharset unicharset: data/unicharset # Create lists of lstmf filenames for training and eval #lists: $(ALL_LSTMF) data/list.train data/list.eval lists: $(ALL_LSTMF) train-lists: data/list.train data/list.eval data/list.train: $(ALL_LSTMF) total=`cat $(ALL_LSTMF) | wc -l` \ no=`echo "$$total * $(RATIO_TRAIN) / 1" | bc`; \ head -n "$$no" $(ALL_LSTMF) > "$@" data/list.eval: $(ALL_LSTMF) total=`cat $(ALL_LSTMF) | wc -l` \ no=`echo "($$total - $$total * $(RATIO_TRAIN)) / 1" | bc`; \ tail -n "$$no" $(ALL_LSTMF) > "$@" # Start training training: data/$(MODEL_NAME).traineddata data/unicharset: $(ALL_BOXES) mkdir -p data/$(START_MODEL) combine_tessdata -u $(TESSDATA)/$(CONTINUE_FROM).traineddata $(TESSDATA)/$(CONTINUE_FROM). unicharset_extractor --output_unicharset "$(TRAIN)/my.unicharset" --norm_mode $(NORM_MODE) "$(ALL_BOXES)" #merge_unicharsets data/$(START_MODEL)/$(START_MODEL).lstm-unicharset $(GROUND_TRUTH_DIR)/my.unicharset "$@" merge_unicharsets $(TESSDATA)/$(CONTINUE_FROM).lstm-unicharset $(TRAIN)/my.unicharset "$@" $(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(TRAIN)/*.tif))) find $(TRAIN) -name '*.box' -exec cat {} \; > "$@" $(TRAIN)/%.box: $(TRAIN)/%.tif $(TRAIN)/%.gt.txt python generate_line_box.py -i "$(TRAIN)/$*.tif" -t "$(TRAIN)/$*.gt.txt" > "$@" $(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(TRAIN)/*.tif))) find $(TRAIN) -name '*.lstmf' -exec echo {} \; | sort -R -o "$@" $(TRAIN)/%.lstmf: $(TRAIN)/%.box tesseract $(TRAIN)/$*.tif $(TRAIN)/$* --dpi 300 --psm 7 lstm.train # Build the proto model proto-model: data/$(MODEL_NAME)/$(MODEL_NAME).traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata: $(LANGDATA) data/unicharset combine_lang_model \ --input_unicharset data/unicharset \ 
--script_dir $(LANGDATA) \ --words $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).wordlist \ --numbers $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).numbers \ --puncs $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).punc \ --output_dir data/ \ --lang $(MODEL_NAME) data/checkpoints/$(MODEL_NAME)_checkpoint: unicharset proto-model mkdir -p data/checkpoints lstmtraining \ --continue_from $(TESSDATA)/$(CONTINUE_FROM).lstm \ --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \ --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \ --model_output data/checkpoints/$(MODEL_NAME) \ --debug_interval -1 \ --train_listfile data/list.train \ --eval_listfile data/list.eval \ --sequential_training \ --max_iterations 170000 data/$(MODEL_NAME).traineddata: data/checkpoints/$(MODEL_NAME)_checkpoint lstmtraining \ --stop_training \ --continue_from $^ \ --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \ --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \ --model_output $@ # Clean all generated files clean: find data/train -name '*.box' -delete find data/train -name '*.lstmf' -delete rm -rf data/all-* rm -rf data/list.* rm -rf data/$(MODEL_NAME) rm -rf data/unicharset rm -rf data/checkpoints The number of .lstmf files being generated is significantly lower than the number of .box files being generated. For example: Number of .tif files: 10k Number of .gt.txt files: 10k Number of .box files: 10k Number of .lstmf files: 8k. Could anyone point out the possible reasons for this issue? On Friday, June 29, 2018 at 5:39:09 PM UTC+5:30, shree wrote: > > I modified the makefile for ocrd-train to do fine-tuning. 
It is pasted > below: > > export > > SHELL := /bin/bash > LOCAL := $(PWD)/usr > PATH := $(LOCAL)/bin:$(PATH) > HOME := /home/ubuntu > TESSDATA = $(HOME)/tessdata_best > LANGDATA = $(HOME)/langdata > > # Name of the model to be built > MODEL_NAME = frk > > # Name of the model to continue from > CONTINUE_FROM = frk > > # Normalization Mode - see src/training/language_specific.sh for details > NORM_MODE = 2 > > # Tesseract model repo to use. Default: $(TESSDATA_REPO) > TESSDATA_REPO = _best > > # Train directory > TRAIN := data/train > > # BEGIN-EVAL makefile-parser --make-help Makefile > > help: > @echo "" > @echo " Targets" > @echo "" > @echo " unicharset Create unicharset" > @echo " lists Create lists of lstmf filenames for training > and eval" > @echo " training Start training" > @echo " proto-model Build the proto model" > @echo " leptonica Build leptonica" > @echo " tesseract Build tesseract" > @echo " tesseract-langs Download tesseract-langs" > @echo " langdata Download langdata" > @echo " clean Clean all generated files" > @echo "" > @echo " Variables" > @echo "" > @echo " MODEL_NAME Name of the model to be built" > @echo " CORES No of cores to use for compiling > leptonica/tesseract" > @echo " LEPTONICA_VERSION Leptonica version. Default: > $(LEPTONICA_VERSION)" > @echo " TESSERACT_VERSION Tesseract commit. Default: > $(TESSERACT_VERSION)" > @echo " LANGDATA_VERSION Tesseract langdata version. Default: > $(LANGDATA_VERSION)" > @echo " TESSDATA_REPO Tesseract model repo to use. 
Default: > $(TESSDATA_REPO)" > @echo " TRAIN Train directory" > @echo " RATIO_TRAIN Ratio of train / eval training data" > > # END-EVAL > > # Ratio of train / eval training data > RATIO_TRAIN := 0.90 > > ALL_BOXES = data/all-boxes > ALL_LSTMF = data/all-lstmf > > # Create unicharset > unicharset: data/unicharset > > # Create lists of lstmf filenames for training and eval > lists: $(ALL_LSTMF) data/list.train data/list.eval > > data/list.train: $(ALL_LSTMF) > total=`cat $(ALL_LSTMF) | wc -l` \ > no=`echo "$$total * $(RATIO_TRAIN) / 1" | bc`; \ > head -n "$$no" $(ALL_LSTMF) > "$@" > > data/list.eval: $(ALL_LSTMF) > total=`cat $(ALL_LSTMF) | wc -l` \ > no=`echo "($$total - $$total * $(RATIO_TRAIN)) / 1" | bc`; \ > tail -n "+$$no" $(ALL_LSTMF) > "$@" > > # Start training > training: data/$(MODEL_NAME).traineddata > > data/unicharset: $(ALL_BOXES) > combine_tessdata -u $(TESSDATA)/$(CONTINUE_FROM).traineddata > $(TESSDATA)/$(CONTINUE_FROM). > unicharset_extractor --output_unicharset "$(TRAIN)/my.unicharset" > --norm_mode $(NORM_MODE) "$(ALL_BOXES)" > merge_unicharsets $(TESSDATA)/$(CONTINUE_FROM).lstm-unicharset > $(TRAIN)/my.unicharset "$@" > $(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(TRAIN)/*.tif))) > find $(TRAIN) -name '*.box' -exec cat {} \; > "$@" > $(TRAIN)/%.box: $(TRAIN)/%.tif $(TRAIN)/%-gt.txt > python generate_line_box.py -i "$(TRAIN)/$*.tif" -t "$(TRAIN)/$*-gt.txt" > > "$@" > > $(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(TRAIN)/*.tif))) > find $(TRAIN) -name '*.lstmf' -exec echo {} \; | sort -R -o "$@" > > $(TRAIN)/%.lstmf: $(TRAIN)/%.box > tesseract $(TRAIN)/$*.tif $(TRAIN)/$* --psm 6 lstm.train > > # Build the proto model > proto-model: data/$(MODEL_NAME)/$(MODEL_NAME).traineddata > > data/$(MODEL_NAME)/$(MODEL_NAME).traineddata: $(LANGDATA) data/unicharset > combine_lang_model \ > --input_unicharset data/unicharset \ > --script_dir $(LANGDATA) \ > --words $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).wordlist \ > --numbers 
$(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).numbers \ > --puncs $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).punc \ > --output_dir data/ \ > --lang $(MODEL_NAME) > > data/checkpoints/$(MODEL_NAME)_checkpoint: unicharset lists proto-model > mkdir -p data/checkpoints > lstmtraining \ > --continue_from $(TESSDATA)/$(CONTINUE_FROM).lstm \ > --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \ > --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \ > --model_output data/checkpoints/$(MODEL_NAME) \ > --debug_interval -1 \ > --train_listfile data/list.train \ > --eval_listfile data/list.eval \ > --sequential_training \ > --max_iterations 3000 > > data/$(MODEL_NAME).traineddata: data/checkpoints/$(MODEL_NAME)_checkpoint > lstmtraining \ > --stop_training \ > --continue_from $^ \ > --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \ > --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \ > --model_output $@ > > # Clean all generated files > clean: > find data/train -name '*.box' -delete > find data/train -name '*.lstmf' -delete > rm -rf data/all-* > rm -rf data/list.* > rm -rf data/$(MODEL_NAME) > rm -rf data/unicharset > rm -rf data/checkpoints > > On Fri, Jun 29, 2018 at 5:31 PM Lorenzo Bolzani <l.bo...@gmail.com > <javascript:>> wrote: > >> >> >> Hi, >> I'm trying to do fine tuning of an existing model using line images and >> text labels. I'm running this version: >> >> tesseract 4.0.0-beta.3-56-g5fda >> leptonica-1.76.0 >> libgif 5.1.4 : libjpeg 8d (libjpeg-turbo 1.4.2) : libpng 1.2.54 : >> libtiff 4.0.6 : zlib 1.2.8 : libwebp 0.4.4 : libopenjp2 2.3.0 >> Found AVX2 >> Found AVX >> Found SSE >> >> >> >> I used OCR-D to generate lstmf files for the demo data. >> >> If I run the make command it works fine. 
>> >> make training MODEL_NAME=prova >> >> Now I isolated this command from the build: >> >> lstmtraining \ >> --traineddata data/prova/prova.traineddata \ >> --net_spec "[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c`head >> -n1 data/unicharset`]" \ >> --model_output data/checkpoints/prova \ >> --learning_rate 20e-4 \ >> --train_listfile data/list.train \ >> --eval_listfile data/list.eval \ >> --max_iterations 10000 >> >> and it works fine. >> >> Now I'm trying to modify it to fine-tune the existing eng model. I made a >> few attempts, all ending in different errors (see the attached file for >> full output). >> >> I used: >> >> combine_tessdata -e /usr/local/share/tessdata/eng.traineddata >> extracted/eng.lstm >> >> to extract the eng.lstm model. >> >> This seems to work, but I'm not sure it is correct. >> >> lstmtraining \ >> --continue_from extracted/eng.lstm \ >> --traineddata data/prova/prova.traineddata \ >> --old_traineddata extracted/eng.traineddata \ >> --model_output data/checkpoints/prova \ >> --learning_rate 20e-4 \ >> --train_listfile data/list.train \ >> --eval_listfile data/list.eval \ >> --max_iterations 10000 >> >> (extracted/eng.traineddata is just a copy of eng.traineddata) >> >> >> The training resumes exactly with the RMS of prova_checkpoint (6%) so it >> looks like it is training from that checkpoint, not the eng.lstm. >> >> Is this correct? What should I change? >> >> I'm following this guide: >> >> >> https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for--a-few-characters >> >> >> I think continue_from and traineddata should refer to the eng model and >> old_traineddata should point to prova.traineddata, but if I do that I get a >> segmentation fault: >> >> [...] >> !int_mode_:Error:Assert failed:in file weightmatrix.cpp, line 244 >> !int_mode_:Error:Assert failed:in file weightmatrix.cpp, line 244 >> Segmentation fault >> >> What am I missing? 
>> >> >> Thanks, bye >> >> Lorenzo >> >> -- >> You received this message because you are subscribed to the Google Groups >> "tesseract-ocr" group. >> To unsubscribe from this group and stop receiving emails from it, send an >> email to tesser...@googlegroups.com <javascript:>. >> To post to this group, send email to tesser...@googlegroups.com >> <javascript:>. >> Visit this group at https://groups.google.com/group/tesseract-ocr. >> To view this discussion on the web visit >> https://groups.google.com/d/msgid/tesseract-ocr/CAMgOLLyOJN31PdWQumXPO3JjuAc1Yz2BZYpMd4ftzBHgZkEaxA%40mail.gmail.com >> >> <https://groups.google.com/d/msgid/tesseract-ocr/CAMgOLLyOJN31PdWQumXPO3JjuAc1Yz2BZYpMd4ftzBHgZkEaxA%40mail.gmail.com?utm_medium=email&utm_source=footer> >> . >> For more options, visit https://groups.google.com/d/optout. >> > > > -- > > ____________________________________________________________ > भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com > -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/e3ba3b90-a8c8-4085-bec5-cf918034ba2a%40googlegroups.com.