I modified the makefile for ocrd-train to do fine-tuning. It is pasted below:
# Makefile for fine-tuning an existing Tesseract 4 LSTM model from line images
# (*.tif) and ground-truth text (*-gt.txt), adapted from ocrd-train.
#
# Pipeline: *.tif + *-gt.txt -> *.box -> *.lstmf -> list.train / list.eval
#           boxes -> unicharset (merged with the base model's) -> proto model
#           -> lstmtraining checkpoint -> final data/$(MODEL_NAME).traineddata

# Export every variable defined below to the environment of the recipes.
export

SHELL := /bin/bash
LOCAL := $(PWD)/usr
PATH := $(LOCAL)/bin:$(PATH)
# NOTE(review): HOME is hard-coded to the author's machine; adjust as needed.
HOME := /home/ubuntu
TESSDATA = $(HOME)/tessdata_best
LANGDATA = $(HOME)/langdata

# Name of the model to be built
MODEL_NAME = frk

# Name of the model to continue from
CONTINUE_FROM = frk

# Normalization Mode - see src/training/language_specific.sh for details
NORM_MODE = 2

# Tesseract model repo to use. Default: $(TESSDATA_REPO)
TESSDATA_REPO = _best

# Train directory
TRAIN := data/train

# Ratio of train / eval training data
RATIO_TRAIN := 0.90

ALL_BOXES = data/all-boxes
ALL_LSTMF = data/all-lstmf

# Derived file names (recursive on purpose so a command-line MODEL_NAME wins).
PROTO_MODEL = data/$(MODEL_NAME)/$(MODEL_NAME).traineddata
CHECKPOINT = data/checkpoints/$(MODEL_NAME)_checkpoint

# These names are commands, not files; without this a stray file called
# e.g. "clean" or "lists" would silently disable the target.
.PHONY: help unicharset lists training proto-model clean

# BEGIN-EVAL makefile-parser --make-help Makefile

help:
	@echo ""
	@echo "  Targets"
	@echo ""
	@echo "    unicharset     Create unicharset"
	@echo "    lists          Create lists of lstmf filenames for training and eval"
	@echo "    training       Start training"
	@echo "    proto-model    Build the proto model"
	@echo "    clean          Clean all generated files"
	@echo ""
	@echo "  Variables"
	@echo ""
	@echo "    MODEL_NAME     Name of the model to be built"
	@echo "    CONTINUE_FROM  Name of the model to continue from"
	@echo "    NORM_MODE      Normalization Mode - see src/training/language_specific.sh for details"
	@echo "    TESSDATA_REPO  Tesseract model repo to use. Default: $(TESSDATA_REPO)"
	@echo "    TRAIN          Train directory"
	@echo "    RATIO_TRAIN    Ratio of train / eval training data"

# END-EVAL

# Create unicharset
unicharset: data/unicharset

# Create lists of lstmf filenames for training and eval
lists: $(ALL_LSTMF) data/list.train data/list.eval

# First RATIO_TRAIN share of the shuffled lstmf list ("/ 1" makes bc truncate
# to an integer).
data/list.train: $(ALL_LSTMF)
	total=$$(wc -l < "$<"); \
	  train=$$(echo "$$total * $(RATIO_TRAIN) / 1" | bc); \
	  head -n "$$train" "$<" > "$@"

# Remaining lines of the same list. The count is total - train, and tail is
# given a plain count (NOT "+N", which means "start at line N" and would make
# the eval set overlap almost the whole training set).
data/list.eval: $(ALL_LSTMF)
	total=$$(wc -l < "$<"); \
	  train=$$(echo "$$total * $(RATIO_TRAIN) / 1" | bc); \
	  tail -n "$$((total - train))" "$<" > "$@"

# Start training
training: data/$(MODEL_NAME).traineddata

# Unicharset = unicharset of the base model merged with the characters found
# in the training boxes. Side effect: combine_tessdata -u unpacks the base
# model's components (including $(CONTINUE_FROM).lstm and
# $(CONTINUE_FROM).lstm-unicharset) next to it in $(TESSDATA); the trailing
# "." in the output prefix is intentional.
data/unicharset: $(ALL_BOXES)
	combine_tessdata -u $(TESSDATA)/$(CONTINUE_FROM).traineddata $(TESSDATA)/$(CONTINUE_FROM).
	unicharset_extractor --output_unicharset "$(TRAIN)/my.unicharset" --norm_mode $(NORM_MODE) "$(ALL_BOXES)"
	merge_unicharsets $(TESSDATA)/$(CONTINUE_FROM).lstm-unicharset $(TRAIN)/my.unicharset "$@"

# Concatenation of all box files (one .box expected per .tif).
$(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(TRAIN)/*.tif)))
	find $(TRAIN) -name '*.box' -exec cat {} \; > "$@"

# Box file for one line image, generated from its ground-truth text.
$(TRAIN)/%.box: $(TRAIN)/%.tif $(TRAIN)/%-gt.txt
	python generate_line_box.py -i "$(TRAIN)/$*.tif" -t "$(TRAIN)/$*-gt.txt" > "$@"

# Randomly shuffled list of all lstmf file names.
$(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(TRAIN)/*.tif)))
	find $(TRAIN) -name '*.lstmf' -exec echo {} \; | sort -R -o "$@"

# Training sample for one line image; tesseract writes $(TRAIN)/<stem>.lstmf.
# The .tif is read by the recipe, so it is declared as a prerequisite too.
$(TRAIN)/%.lstmf: $(TRAIN)/%.tif $(TRAIN)/%.box
	tesseract "$<" "$(TRAIN)/$*" --psm 6 lstm.train

# Build the proto model
proto-model: $(PROTO_MODEL)

# $(LANGDATA) is order-only: the directory must exist, but its timestamp
# (which changes whenever its contents change) must not trigger a rebuild.
$(PROTO_MODEL): data/unicharset | $(LANGDATA)
	combine_lang_model \
	  --input_unicharset data/unicharset \
	  --script_dir $(LANGDATA) \
	  --words $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).wordlist \
	  --numbers $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).numbers \
	  --puncs $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).punc \
	  --output_dir data/ \
	  --lang $(MODEL_NAME)

# Fine-tune from the base model's LSTM. Prerequisites are the real files
# rather than the phony names (unicharset / lists / proto-model): depending on
# phony names would re-run the whole training on every invocation.
$(CHECKPOINT): data/unicharset data/list.train data/list.eval $(PROTO_MODEL)
	mkdir -p data/checkpoints
	lstmtraining \
	  --continue_from $(TESSDATA)/$(CONTINUE_FROM).lstm \
	  --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
	  --traineddata $(PROTO_MODEL) \
	  --model_output data/checkpoints/$(MODEL_NAME) \
	  --debug_interval -1 \
	  --train_listfile data/list.train \
	  --eval_listfile data/list.eval \
	  --sequential_training \
	  --max_iterations 3000

# Convert the training checkpoint into the final traineddata file.
data/$(MODEL_NAME).traineddata: $(CHECKPOINT)
	lstmtraining \
	  --stop_training \
	  --continue_from $< \
	  --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
	  --traineddata $(PROTO_MODEL) \
	  --model_output $@

# Clean all generated files
clean:
	$(if $(strip $(MODEL_NAME)),,$(error MODEL_NAME is empty; refusing to run rm -rf data/))
	find $(TRAIN) -name '*.box' -delete
	find $(TRAIN) -name '*.lstmf' -delete
	rm -f "$(TRAIN)/my.unicharset"
	rm -rf data/all-*
	rm -rf data/list.*
	rm -rf data/$(MODEL_NAME)
	rm -rf data/unicharset
	rm -rf data/checkpoints

On Fri, Jun 29, 2018 at 5:31 PM Lorenzo Bolzani <l.bolz...@gmail.com> wrote: > > > Hi, > I'm trying to do fine tuning of an existing model using line images and > text labels. I'm running this version: > > tesseract 4.0.0-beta.3-56-g5fda > leptonica-1.76.0 > libgif 5.1.4 : libjpeg 8d (libjpeg-turbo 1.4.2) : libpng 1.2.54 : > libtiff 4.0.6 : zlib 1.2.8 : libwebp 0.4.4 : libopenjp2 2.3.0 > Found AVX2 > Found AVX > Found SSE > > > > I used OCR-D to generate lstmf files for the demo data. > > If I run the make command it works fine. > > make training MODEL_NAME=prova > > Now I isolated this command from the build: > > lstmtraining \ > --traineddata data/prova/prova.traineddata \ > --net_spec "[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c`head > -n1 data/unicharset`]" \ > --model_output data/checkpoints/prova \ > --learning_rate 20e-4 \ > --train_listfile data/list.train \ > --eval_listfile data/list.eval \ > --max_iterations 10000 > > and it works fine. 
> > Now I'm trying to modify it to fine tune the existing eng model. I made a > few attempts, all ending in different errors (see the attached file for > full output). > > I used: > > combine_tessdata -e /usr/local/share/tessdata/eng.traineddata > extracted/eng.lstm > > to extract the eng.lstm model. > > This seems to work, but I'm not sure it is correct. > > lstmtraining \ > --continue_from extracted/eng.lstm \ > --traineddata data/prova/prova.traineddata \ > --old_traineddata extracted/eng.traineddata \ > --model_output data/checkpoints/prova \ > --learning_rate 20e-4 \ > --train_listfile data/list.train \ > --eval_listfile data/list.eval \ > --max_iterations 10000 > > (extracted/eng.traineddata is just a copy of eng.traineddata) > > > The training resumes exactly with the RMS of prova_checkpoint (6%) so it > looks like it is training from that checkpoint, not the eng.lstm. > > Is this correct? What should I change? > > I'm following this guide: > > > https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for--a-few-characters > > > I think continue_from and traineddata should refer to the eng model and > old_traineddata should point to prova.traineddata, but if I do that I get a > segmentation fault: > > [...] > !int_mode_:Error:Assert failed:in file weightmatrix.cpp, line 244 > !int_mode_:Error:Assert failed:in file weightmatrix.cpp, line 244 > Segmentation fault > > What am I missing? > > > Thanks, bye > > Lorenzo > > -- > You received this message because you are subscribed to the Google Groups > "tesseract-ocr" group. > To unsubscribe from this group and stop receiving emails from it, send an > email to tesseract-ocr+unsubscr...@googlegroups.com. > To post to this group, send email to tesseract-ocr@googlegroups.com. > Visit this group at https://groups.google.com/group/tesseract-ocr. 
> To view this discussion on the web visit > https://groups.google.com/d/msgid/tesseract-ocr/CAMgOLLyOJN31PdWQumXPO3JjuAc1Yz2BZYpMd4ftzBHgZkEaxA%40mail.gmail.com > <https://groups.google.com/d/msgid/tesseract-ocr/CAMgOLLyOJN31PdWQumXPO3JjuAc1Yz2BZYpMd4ftzBHgZkEaxA%40mail.gmail.com?utm_medium=email&utm_source=footer> > . > For more options, visit https://groups.google.com/d/optout. > -- ____________________________________________________________ भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com -- You received this message because you are subscribed to the Google Groups "tesseract-ocr" group. To unsubscribe from this group and stop receiving emails from it, send an email to tesseract-ocr+unsubscr...@googlegroups.com. To post to this group, send email to tesseract-ocr@googlegroups.com. Visit this group at https://groups.google.com/group/tesseract-ocr. To view this discussion on the web visit https://groups.google.com/d/msgid/tesseract-ocr/CAG2NduWe%3Dv9YvYAMTAzm9yNEFFtqjnxBVGDe9x4tQd1Pnjiwqw%40mail.gmail.com. For more options, visit https://groups.google.com/d/optout.