#!/bin/sh
#
# Train models with onmt_train, translate the test source with every saved
# checkpoint, post-process the translations, and score them with sacrebleu.
#
# The script is interactive: it asks for
#   DIR - name of a directory under ./data (also the prefix of the config
#         files ./configs/${DIR}_{only,train,tuned}.yaml and of the
#         checkpoint files ./out/${OUT}/models/${DIR}_*.pt)
#   OUT - name of the run directory created under ./out
# Both are exported, as in the original script, so child processes see them.
#
# All paths are relative: run it from the project root.

# Stop at the first failing command or unset variable, so that a failed
# training run is never followed by translation/scoring of stale checkpoints.
set -eu

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# translate_checkpoints PREFIX SRC SPM_MODEL
#
# For every checkpoint matching "PREFIX*.pt":
#   1. translate SRC with onmt_translate        -> <name>_uni.txt
#   2. undo the subword split with spm_decode   -> <name>_tok.txt
#   3. detokenize (French) with sacremoses      -> <name>.txt
#   4. delete the two intermediate files
# where <name> is the checkpoint file name without ".pt" and all outputs go
# to ${TRANSLATIONS_DIR}.
#
# Each checkpoint is taken through the whole pipeline before the next one,
# so only files produced by this call are ever decoded with SPM_MODEL.
# Variables are prefixed with tc_ because POSIX sh has no `local`.
translate_checkpoints() {
  tc_prefix=$1
  tc_src=$2
  tc_spm_model=$3
  tc_count=0

  for tc_checkpoint in "${tc_prefix}"*.pt; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "${tc_checkpoint}" ] || continue
    tc_count=$((tc_count + 1))

    tc_name=$(basename "${tc_checkpoint}" .pt)
    # Use the full name as the stem: stripping "${name%.*}" again would
    # truncate checkpoint names that contain a dot.
    tc_stem="${TRANSLATIONS_DIR}/${tc_name}"

    printf '# Translating checkpoint %s\n' "${tc_name}"
    onmt_translate \
      --verbose \
      --replace_unk \
      --model "${tc_checkpoint}" \
      --src "${tc_src}" \
      --output "${tc_stem}_uni.txt"

    spm_decode \
      --model="${tc_spm_model}" \
      --input_format=piece \
      < "${tc_stem}_uni.txt" \
      > "${tc_stem}_tok.txt"

    sacremoses \
      -l fr \
      detokenize \
      < "${tc_stem}_tok.txt" \
      > "${tc_stem}.txt"

    rm -f -- "${tc_stem}_uni.txt" "${tc_stem}_tok.txt"
  done

  if [ "${tc_count}" -eq 0 ]; then
    printf 'warning: no checkpoint matches %s*.pt\n' "${tc_prefix}" >&2
  fi
}

# Directories
#############

echo 'Choose directory in the data folder'
# `read` returns non-zero on EOF even when it filled the variable, so its
# status is ignored and the value itself is validated instead.
read -r DIR || true
[ -n "${DIR}" ] || die 'error: no data directory given'
[ -d "./data/${DIR}" ] || die "error: ./data/${DIR} is not a directory"
export DIR

echo 'Create directory in the output folder'
read -r OUT || true
[ -n "${OUT}" ] || die 'error: no output directory given'
export OUT

OUT_DIR="./out/${OUT}"
MODELS_DIR="${OUT_DIR}/models"
TRANSLATIONS_DIR="${OUT_DIR}/translations"

mkdir -p \
  "${OUT_DIR}" \
  "${MODELS_DIR}" \
  "${TRANSLATIONS_DIR}" \
  "${OUT_DIR}/logs" \
  "${OUT_DIR}/tensor"

# Training
##########

onmt_train --config "./configs/${DIR}_only.yaml"
onmt_train --config "./configs/${DIR}_train.yaml"
onmt_train --config "./configs/${DIR}_tuned.yaml"

# Translation
#############

# Drop intermediates left behind by an interrupted earlier run so they are
# not picked up by the evaluation glob below.
rm -f -- "${TRANSLATIONS_DIR}/"*_uni.txt "${TRANSLATIONS_DIR}/"*_tok.txt

# For fine-tuned models
# NOTE(review): the "_t*" pattern matches every checkpoint whose name starts
# with "${DIR}_t" -- confirm that only the intended models are saved under it.
translate_checkpoints \
  "${MODELS_DIR}/${DIR}_t" \
  "./data/${DIR}/tok/tra_unigram_multi.en" \
  "./data/${DIR}/subword/unigram_multi_fr.model"

# For video game only models
translate_checkpoints \
  "${MODELS_DIR}/${DIR}_o" \
  "./data/${DIR}/tok/tra_unigram_only.en" \
  "./data/${DIR}/subword/unigram_only_fr.model"

# Evaluation
############

# Collect the final translations in "$@" so each file stays one argument,
# and refuse to score when the glob matched nothing.
set -- "${TRANSLATIONS_DIR}/"*.txt
[ -e "$1" ] || die "error: no translations found in ${TRANSLATIONS_DIR}"

# Results are appended, so BLEU.txt accumulates scores across runs.
sacrebleu "./data/${DIR}/tra.fr" \
  --input "$@" \
  --language-pair en-fr \
  --metrics bleu chrf ter \
  --chrf-word-order 2 \
  --tokenize 13a \
  --width 2 \
  --format text \
  >> "${OUT_DIR}/BLEU.txt"