Re: [tesseract-ocr] Fine tuning existing model

Ayush Pandey Thu, 05 Sep 2019 00:55:21 -0700


Tesseract Version: 4.1.0


I am trying to fine-tune Tesseract on a custom dataset with the following 
Makefile:

# Export every variable defined below to the environment of recipe commands.
export

# Recipes are run with bash.
SHELL := /bin/bash

# Root of the working tree. $(CURDIR) is maintained by make itself, so it
# stays correct under `make -C <dir>` and when PWD is absent from the
# environment. NOTE: because of the bare `export` above, this also replaces
# $HOME for every command the recipes run.
HOME := $(CURDIR)

# Directory holding $(CONTINUE_FROM).traineddata (the model to fine-tune)
TESSDATA = $(HOME)/tessdata

# Directory passed to combine_lang_model as --script_dir; the wordlist,
# numbers and punc files are read from $(LANGDATA)/$(MODEL_NAME)/
LANGDATA = $(HOME)/langdata

# Train directory
# TRAIN := $(HOME)/train_data
TRAIN := /media/vimaan/Data/OCR/tesseract_train

# Name of the model to be built
MODEL_NAME = eng
LANG_CODE = eng

# Name of the model to continue from
CONTINUE_FROM = eng

TESSDATA_REPO = _best

# Normalization Mode - see src/training/language_specific.sh for details
NORM_MODE = 1

# `help` names no file; declare it phony so a stray file called `help`
# cannot suppress the output.
.PHONY: help

# BEGIN-EVAL makefile-parser --make-help Makefile

# Print the targets and variables this Makefile actually provides.
# Each message is a single echo: a quoted string must not be broken
# across recipe lines.
help:
	@echo ""
	@echo "  Targets"
	@echo ""
	@echo "    unicharset       Create unicharset"
	@echo "    lists            Create list of all lstmf filenames"
	@echo "    train-lists      Split the lstmf list into training and eval lists"
	@echo "    training         Start training"
	@echo "    proto-model      Build the proto model"
	@echo "    clean            Clean all generated files"
	@echo ""
	@echo "  Variables"
	@echo ""
	@echo "    MODEL_NAME         Name of the model to be built"
	@echo "    CONTINUE_FROM      Name of the model to continue from. Default: $(CONTINUE_FROM)"
	@echo "    NORM_MODE          Normalization mode. Default: $(NORM_MODE)"
	@echo "    TESSDATA_REPO      Tesseract model repo to use. Default: $(TESSDATA_REPO)"
	@echo "    TRAIN              Train directory"
	@echo "    RATIO_TRAIN        Ratio of train / eval training data"

# END-EVAL

# Aggregates built from the per-image files under $(TRAIN):
# all-lstmf is the shuffled list of .lstmf paths, all-boxes the
# concatenation of the .box files.
ALL_LSTMF = data/all-lstmf
ALL_BOXES = data/all-boxes

# Ratio of train / eval training data
RATIO_TRAIN := 0.90

# The three targets below are aliases for generated files and name no file
# themselves; declaring them phony keeps a stray file called e.g. `lists`
# from masking them.
.PHONY: unicharset lists train-lists

# Create unicharset
unicharset: data/unicharset

# Create the shuffled list of all lstmf filenames.
# The split into training and eval lists is a separate target (train-lists).
lists: $(ALL_LSTMF)

# Split the lstmf list into training and eval lists
train-lists: data/list.train data/list.eval

# Training list: the leading RATIO_TRAIN share of the shuffled lstmf list.
# The `/ 1` makes bc truncate the product to a whole number of lines.
data/list.train: $(ALL_LSTMF)
	lines=$$(wc -l < $<); \
	   keep=$$(echo "$$lines * $(RATIO_TRAIN) / 1" | bc); \
	   head -n "$$keep" $< > "$@"

# Eval list: every line of the shuffled lstmf list that is NOT in
# data/list.train.
# The training count is recomputed with the same expression data/list.train
# uses, and the eval list starts on the following line. Truncating a
# separately computed eval count can lose a line: 15 lines at 0.90 gives
# 13 for training but (15 - 13.5) -> 1 for eval, leaving one sample unused.
data/list.eval: $(ALL_LSTMF)
	total=$$(wc -l < "$<"); \
	   train=$$(echo "$$total * $(RATIO_TRAIN) / 1" | bc); \
	   tail -n "+$$((train + 1))" "$<" > "$@"

# `training` is an alias for the finished model file, not a file itself.
.PHONY: training

# Start training
training: data/$(MODEL_NAME).traineddata

# Build the unicharset used for fine-tuning, in three steps:
#   1. unpack the components of the base model next to its .traineddata
#      (this is where $(CONTINUE_FROM).lstm-unicharset and, for the training
#      step, $(CONTINUE_FROM).lstm come from);
#   2. extract the characters occurring in the concatenated box data;
#   3. merge the base model's unicharset with the extracted one into $@.
# Side outputs besides $@: $(TESSDATA)/$(CONTINUE_FROM).* and
# $(TRAIN)/my.unicharset.
data/unicharset: $(ALL_BOXES)
	mkdir -p $(@D)
	combine_tessdata -u $(TESSDATA)/$(CONTINUE_FROM).traineddata \
	  $(TESSDATA)/$(CONTINUE_FROM).
	unicharset_extractor --output_unicharset "$(TRAIN)/my.unicharset" \
	  --norm_mode $(NORM_MODE) "$<"
	merge_unicharsets $(TESSDATA)/$(CONTINUE_FROM).lstm-unicharset \
	  $(TRAIN)/my.unicharset "$@"
        
# Concatenate every .box file found under $(TRAIN) into one file.
# The prerequisites force a .box to be generated for each top-level .tif;
# the recipe itself collects with find (recursively), so the file list never
# has to fit on one command line.
# This can be the first file written below data/, hence the mkdir.
# `-exec ... +` hands many files to each cat instead of forking once per file.
$(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(TRAIN)/*.tif)))
	mkdir -p $(@D)
	find $(TRAIN) -name '*.box' -exec cat {} + > "$@"
        
# Generate the box file for one line image from the image and its
# ground-truth text (<stem>.gt.txt).
# The shell creates $@ before the generator runs, so on failure the partial
# file is removed; otherwise it would look up to date on the next run.
$(TRAIN)/%.box: $(TRAIN)/%.tif $(TRAIN)/%.gt.txt
	python generate_line_box.py -i "$<" -t "$(TRAIN)/$*.gt.txt" > "$@" \
	  || { rm -f "$@"; exit 1; }

# Write the list of .lstmf files under $(TRAIN), one path per line, in
# random order (sort -R).
# Only files that exist are listed: an image for which no .lstmf was written
# is simply absent from the list.
# This can be the first file written below data/, hence the mkdir.
# `-print` emits the same lines as `-exec echo {} \;` without one process
# per file.
$(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(TRAIN)/*.tif)))
	mkdir -p $(@D)
	find $(TRAIN) -name '*.lstmf' -print | sort -R -o "$@"

# Produce the LSTM training file for one line image; tesseract reads
# <stem>.tif and the output base <stem> makes it write <stem>.lstmf.
# The image is declared as a prerequisite because the recipe reads it.
# The second recipe line reports images for which no (or an empty) .lstmf
# was written, since the run can end without an error in that case. It
# only warns, so the remaining images are still processed.
$(TRAIN)/%.lstmf: $(TRAIN)/%.box $(TRAIN)/%.tif
	tesseract "$(TRAIN)/$*.tif" "$(TRAIN)/$*" --dpi 300 --psm 7 lstm.train
	@test -s "$@" || echo "WARNING: no training data written for $(TRAIN)/$*.tif - check its .box and .gt.txt" >&2
        

# `proto-model` is an alias for the starter traineddata, not a file itself.
.PHONY: proto-model

# Build the proto model
proto-model: data/$(MODEL_NAME)/$(MODEL_NAME).traineddata

# Combine the merged unicharset with the language data into the starter
# traineddata; combine_lang_model writes to <output_dir>/<lang>/.
# $(LANGDATA) is an order-only prerequisite: the directory must exist, but
# its timestamp changes whenever its contents do and must not trigger a
# rebuild.
data/$(MODEL_NAME)/$(MODEL_NAME).traineddata: data/unicharset | $(LANGDATA)
	combine_lang_model \
	  --input_unicharset $< \
	  --script_dir $(LANGDATA) \
	  --words $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).wordlist \
	  --numbers $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).numbers \
	  --puncs $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).punc \
	  --output_dir data/ \
	  --lang $(MODEL_NAME)

# Fine-tune the base model: continue from the LSTM extracted from
# $(CONTINUE_FROM).traineddata, using the proto model's traineddata and the
# train/eval lists.
# `train-lists` is a prerequisite because the recipe reads data/list.train
# and data/list.eval; without it nothing in the `training` chain creates
# them.
# All three prerequisites are alias targets rather than files, so this
# recipe runs on every `make training`.
data/checkpoints/$(MODEL_NAME)_checkpoint: unicharset train-lists proto-model
	mkdir -p $(@D)
	lstmtraining \
	  --continue_from   $(TESSDATA)/$(CONTINUE_FROM).lstm \
	  --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
	  --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
	  --model_output data/checkpoints/$(MODEL_NAME) \
	  --debug_interval -1 \
	  --train_listfile data/list.train \
	  --eval_listfile data/list.eval \
	  --sequential_training \
	  --max_iterations 170000

# Final model: run lstmtraining with --stop_training on the checkpoint
# (the only prerequisite) and write the result to the target file.
data/$(MODEL_NAME).traineddata: data/checkpoints/$(MODEL_NAME)_checkpoint
	lstmtraining --stop_training \
	  --continue_from $< \
	  --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
	  --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
	  --model_output $@

# `clean` names no file; declare it phony so a stray file called `clean`
# cannot disable it.
.PHONY: clean

# Clean all generated files
# The per-image outputs live in $(TRAIN), not in a fixed data/train: with
# TRAIN set elsewhere, `find data/train` fails on a missing directory and
# aborts the recipe before anything is removed.
# NOTE: this deletes every .box and .lstmf under $(TRAIN), including any
# that were not generated by this Makefile.
clean:
	find "$(TRAIN)" -name '*.box' -delete
	find "$(TRAIN)" -name '*.lstmf' -delete
	rm -rf data/all-*
	rm -rf data/list.*
	rm -rf data/$(MODEL_NAME)
	rm -rf data/unicharset
	rm -rf data/checkpoints

The number of .lstmf files generated is significantly lower than the number 
of .box files generated.
For example:
Number of .tif files: 10k
Number of .gt.txt files: 10k
Number of .box files: 10k
Number of .lstmf files: 8k.
Could anyone point out possible reasons for this?

On Friday, June 29, 2018 at 5:39:09 PM UTC+5:30, shree wrote:
>
> I modified the makefile for ocrd-train to do fine-tuning.  It is pasted 
> below:
>
> export
>
> SHELL := /bin/bash
> LOCAL := $(PWD)/usr
> PATH := $(LOCAL)/bin:$(PATH)
> HOME := /home/ubuntu
> TESSDATA =  $(HOME)/tessdata_best
> LANGDATA = $(HOME)/langdata
>
> # Name of the model to be built
> MODEL_NAME = frk
>
> # Name of the model to continue from
> CONTINUE_FROM = frk
>
> # Normalization Mode - see src/training/language_specific.sh for details 
> NORM_MODE = 2
>
> # Tesseract model repo to use. Default: $(TESSDATA_REPO)
> TESSDATA_REPO = _best
>
> # Train directory
> TRAIN := data/train
>
> # BEGIN-EVAL makefile-parser --make-help Makefile
>
> help:
> @echo ""
> @echo "  Targets"
> @echo ""
> @echo "    unicharset       Create unicharset"
> @echo "    lists            Create lists of lstmf filenames for training 
> and eval"
> @echo "    training         Start training"
> @echo "    proto-model      Build the proto model"
> @echo "    leptonica        Build leptonica"
> @echo "    tesseract        Build tesseract"
> @echo "    tesseract-langs  Download tesseract-langs"
> @echo "    langdata         Download langdata"
> @echo "    clean            Clean all generated files"
> @echo ""
> @echo "  Variables"
> @echo ""
> @echo "    MODEL_NAME         Name of the model to be built"
> @echo "    CORES              No of cores to use for compiling 
> leptonica/tesseract"
> @echo "    LEPTONICA_VERSION  Leptonica version. Default: 
> $(LEPTONICA_VERSION)"
> @echo "    TESSERACT_VERSION  Tesseract commit. Default: 
> $(TESSERACT_VERSION)"
> @echo "    LANGDATA_VERSION   Tesseract langdata version. Default: 
> $(LANGDATA_VERSION)"
> @echo "    TESSDATA_REPO      Tesseract model repo to use. Default: 
> $(TESSDATA_REPO)"
> @echo "    TRAIN              Train directory"
> @echo "    RATIO_TRAIN        Ratio of train / eval training data"
>
> # END-EVAL
>
> # Ratio of train / eval training data
> RATIO_TRAIN := 0.90
>
> ALL_BOXES = data/all-boxes
> ALL_LSTMF = data/all-lstmf
>
> # Create unicharset
> unicharset: data/unicharset
>
> # Create lists of lstmf filenames for training and eval
> lists: $(ALL_LSTMF) data/list.train data/list.eval
>
> data/list.train: $(ALL_LSTMF)
> total=`cat $(ALL_LSTMF) | wc -l` \
>    no=`echo "$$total * $(RATIO_TRAIN) / 1" | bc`; \
>    head -n "$$no" $(ALL_LSTMF) > "$@"
>
> data/list.eval: $(ALL_LSTMF)
> total=`cat $(ALL_LSTMF) | wc -l` \
>    no=`echo "($$total - $$total * $(RATIO_TRAIN)) / 1" | bc`; \
>    tail -n "+$$no" $(ALL_LSTMF) > "$@"
>
> # Start training
> training: data/$(MODEL_NAME).traineddata
>
> data/unicharset: $(ALL_BOXES)
> combine_tessdata -u $(TESSDATA)/$(CONTINUE_FROM).traineddata  
> $(TESSDATA)/$(CONTINUE_FROM).
> unicharset_extractor --output_unicharset "$(TRAIN)/my.unicharset" 
> --norm_mode $(NORM_MODE) "$(ALL_BOXES)"
> merge_unicharsets $(TESSDATA)/$(CONTINUE_FROM).lstm-unicharset 
> $(TRAIN)/my.unicharset  "$@"
> $(ALL_BOXES): $(sort $(patsubst %.tif,%.box,$(wildcard $(TRAIN)/*.tif)))
> find $(TRAIN) -name '*.box' -exec cat {} \; > "$@"
> $(TRAIN)/%.box: $(TRAIN)/%.tif $(TRAIN)/%-gt.txt
> python generate_line_box.py -i "$(TRAIN)/$*.tif" -t "$(TRAIN)/$*-gt.txt" > 
> "$@"
>
> $(ALL_LSTMF): $(sort $(patsubst %.tif,%.lstmf,$(wildcard $(TRAIN)/*.tif)))
> find $(TRAIN) -name '*.lstmf' -exec echo {} \; | sort -R -o "$@"
>
> $(TRAIN)/%.lstmf: $(TRAIN)/%.box
> tesseract $(TRAIN)/$*.tif $(TRAIN)/$*   --psm 6 lstm.train
>
> # Build the proto model
> proto-model: data/$(MODEL_NAME)/$(MODEL_NAME).traineddata
>
> data/$(MODEL_NAME)/$(MODEL_NAME).traineddata: $(LANGDATA) data/unicharset
> combine_lang_model \
>   --input_unicharset data/unicharset \
>   --script_dir $(LANGDATA) \
>   --words $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).wordlist \
>   --numbers $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).numbers \
>   --puncs $(LANGDATA)/$(MODEL_NAME)/$(MODEL_NAME).punc \
>   --output_dir data/ \
>   --lang $(MODEL_NAME)
>
> data/checkpoints/$(MODEL_NAME)_checkpoint: unicharset lists proto-model
> mkdir -p data/checkpoints
> lstmtraining \
>   --continue_from   $(TESSDATA)/$(CONTINUE_FROM).lstm \
>   --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
>   --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
>   --model_output data/checkpoints/$(MODEL_NAME) \
>   --debug_interval -1 \
>   --train_listfile data/list.train \
>   --eval_listfile data/list.eval \
>   --sequential_training \
>   --max_iterations 3000
>
> data/$(MODEL_NAME).traineddata: data/checkpoints/$(MODEL_NAME)_checkpoint
> lstmtraining \
> --stop_training \
> --continue_from $^ \
> --old_traineddata $(TESSDATA)/$(CONTINUE_FROM).traineddata \
> --traineddata data/$(MODEL_NAME)/$(MODEL_NAME).traineddata \
> --model_output $@
>
> # Clean all generated files
> clean:
> find data/train -name '*.box' -delete
> find data/train -name '*.lstmf' -delete
> rm -rf data/all-*
> rm -rf data/list.*
> rm -rf data/$(MODEL_NAME)
> rm -rf data/unicharset
> rm -rf data/checkpoints
>
> On Fri, Jun 29, 2018 at 5:31 PM Lorenzo Bolzani <l.bo...@gmail.com 
> <javascript:>> wrote:
>
>> 
>>
>> Hi,
>> I'm trying to do fine tuning of an existing model using line images and 
>> text labels. I'm running this version:
>>
>> tesseract 4.0.0-beta.3-56-g5fda
>>  leptonica-1.76.0
>>   libgif 5.1.4 : libjpeg 8d (libjpeg-turbo 1.4.2) : libpng 1.2.54 : 
>> libtiff 4.0.6 : zlib 1.2.8 : libwebp 0.4.4 : libopenjp2 2.3.0
>>  Found AVX2
>>  Found AVX
>>  Found SSE
>>
>>
>>
>> I used OCR-D to generate lstmf files for the demo data.
>>
>> If I run the make command it works fine. 
>>
>> make training MODEL_NAME=prova
>>
>> Now I isolated this command from the build:
>>
>> lstmtraining \
>>   --traineddata data/prova/prova.traineddata \
>>   --net_spec "[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c`head 
>> -n1 data/unicharset`]" \
>>   --model_output data/checkpoints/prova \
>>   --learning_rate 20e-4 \
>>   --train_listfile data/list.train \
>>   --eval_listfile data/list.eval \
>>   --max_iterations 10000
>>
>> and it works fine.
>>
>> Now I'm trying to modify it to fine tune the existing eng model. I made a 
>> few attempts, all ending into different errors (see the attached file for 
>> full output).
>>
>> I used:
>>
>> combine_tessdata -e /usr/local/share/tessdata/eng.traineddata 
>> extracted/eng.lstm
>>
>> to extract the eng.lstm model. 
>>
>> This seems to works but I'm not sure it is the correct.
>>
>> lstmtraining \
>>   --continue_from  extracted/eng.lstm \
>>   --traineddata data/prova/prova.traineddata \
>>   --old_traineddata extracted/eng.traineddata \
>>   --model_output data/checkpoints/prova \
>>   --learning_rate 20e-4 \
>>   --train_listfile data/list.train \
>>   --eval_listfile data/list.eval \
>>   --max_iterations 10000
>>
>> (extracted/eng.traineddata is just a copy of eng.traineddata)
>>
>>
>> The training resume exactly with the RMS of prova_checkpoint (6%) so it 
>> looks like it is training from that checkpoint, not the eng.lstm.
>>
>> Is this correct? What should I change?
>> 
>> I'm following this guide:
>>
>>
>> https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for--a-few-characters
>>
>> 
>> I think continue_from and traineddata should refer to the eng model and 
>> old_traineddata should point to prova.traineddata, but if I do that I get a 
>> segmentation fault:
>>
>> [...]
>> !int_mode_:Error:Assert failed:in file weightmatrix.cpp, line 244
>> !int_mode_:Error:Assert failed:in file weightmatrix.cpp, line 244
>> Segmentation fault
>>
>> What am I missing?
>>
>>
>> Thanks, bye
>>
>> Lorenzo
>>
>> -- 
>> You received this message because you are subscribed to the Google Groups 
>> "tesseract-ocr" group.
>> To unsubscribe from this group and stop receiving emails from it, send an 
>> email to tesser...@googlegroups.com <javascript:>.
>> To post to this group, send email to tesser...@googlegroups.com 
>> <javascript:>.
>> Visit this group at https://groups.google.com/group/tesseract-ocr.
>> To view this discussion on the web visit 
>> https://groups.google.com/d/msgid/tesseract-ocr/CAMgOLLyOJN31PdWQumXPO3JjuAc1Yz2BZYpMd4ftzBHgZkEaxA%40mail.gmail.com
>>  
>> <https://groups.google.com/d/msgid/tesseract-ocr/CAMgOLLyOJN31PdWQumXPO3JjuAc1Yz2BZYpMd4ftzBHgZkEaxA%40mail.gmail.com?utm_medium=email&utm_source=footer>
>> .
>> For more options, visit https://groups.google.com/d/optout.
>>
>
>
> -- 
>
> ____________________________________________________________
> भजन - कीर्तन - आरती @ http://bhajans.ramparivar.com
>

-- 
You received this message because you are subscribed to the Google Groups 
"tesseract-ocr" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to tesseract-ocr+unsubscr...@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/tesseract-ocr/e3ba3b90-a8c8-4085-bec5-cf918034ba2a%40googlegroups.com.

Re: [tesseract-ocr] Fine tuning existing model

Reply via email to