#!/bin/sh
#
# preprocess.sh - tokenize one corpus under ./data, train SentencePiece
# unigram models on it, segment the English "tra" split, and build the
# OpenNMT vocabularies.
#
# Usage: ./preprocess.sh [DIR]
#   DIR  name of a directory inside ./data; prompted for on stdin when the
#        argument is omitted.
#
# Reads:   ./data/DIR/{trn,val,tra}.{en,fr}
#          ./data/{books,europarl,globalvoices,news,ted}/tok/trn.{en,fr}
#          configs/DIR_only.yaml, configs/DIR_tuned.yaml
# Writes:  ./data/DIR/tok/, ./data/DIR/subword/ (and creates ./data/DIR/vocab/)
# Needs:   sacremoses, spm_train, spm_encode, onmt_build_vocab on PATH.
#
# NOTE(review): only the English side of "tra" is segmented with spm_encode;
# confirm that no French counterpart is required.

# Stop at the first failing step so later stages never run on missing or
# stale outputs, and treat unset variables as errors.
set -eu

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Train a 32k unigram SentencePiece model.
#   $1 - comma-separated list of input files (no trailing comma)
#   $2 - model prefix (spm_train writes <prefix>.model / <prefix>.vocab)
train_unigram() {
  spm_train \
    --input="$1" \
    --model_prefix="$2" \
    --vocab_size=32000 \
    --character_coverage=1.0 \
    --model_type=unigram
}

# Segment a tokenized file into subword pieces.
#   $1 - SentencePiece model file
#   $2 - input file
#   $3 - output file
encode_pieces() {
  spm_encode \
    --model="$1" \
    --output_format=piece \
    < "$2" \
    > "$3"
}

# Directories
#############

DIR=${1:-}
if [ -z "$DIR" ]; then
  printf '%s\n' 'Choose directory in the data folder'
  # -r keeps backslashes literal; a closed stdin is reported, not ignored.
  read -r DIR || die "preprocess.sh: no directory given"
fi
[ -n "$DIR" ] || die "preprocess.sh: no directory given"
export DIR

data_dir="./data/${DIR}"
# Fail before mkdir -p so a typo does not create an empty corpus directory.
[ -d "$data_dir" ] || die "preprocess.sh: ${data_dir} does not exist"

mkdir -p "${data_dir}/subword"
mkdir -p "${data_dir}/vocab"
mkdir -p "${data_dir}/tok"

# Tokenization
##############

for split in trn val tra; do
  for lang in en fr; do
    sacremoses -l "$lang" tokenize \
      < "${data_dir}/${split}.${lang}" \
      > "${data_dir}/tok/${split}.${lang}"
  done
done

# Subword segmentation
######################

# For fine-tuning: models trained on the shared corpora plus the selected one.

for lang in en fr; do
  inputs=
  for corpus in books europarl globalvoices news ted "$DIR"; do
    # ${inputs:+...} inserts the separator only between entries, so the list
    # never ends with a comma (which would add an empty file name).
    inputs="${inputs:+${inputs},}./data/${corpus}/tok/trn.${lang}"
  done
  train_unigram "$inputs" "${data_dir}/subword/unigram_multi_${lang}"
done

encode_pieces \
  "${data_dir}/subword/unigram_multi_en.model" \
  "${data_dir}/tok/tra.en" \
  "${data_dir}/tok/tra_unigram_multi.en"

# For the selected corpus only (originally noted as "video game only").

for lang in en fr; do
  train_unigram \
    "${data_dir}/tok/trn.${lang}" \
    "${data_dir}/subword/unigram_only_${lang}"
done

encode_pieces \
  "${data_dir}/subword/unigram_only_en.model" \
  "${data_dir}/tok/tra.en" \
  "${data_dir}/tok/tra_unigram_only.en"

# Building vocab
################

onmt_build_vocab --config "configs/${DIR}_only.yaml" --n_sample -1

onmt_build_vocab --config "configs/${DIR}_tuned.yaml" --n_sample -1