#!/bin/sh
#
# Prepare the EN/FR data found in ./data/<DIR>:
#   1. tokenize the trn/val/tra files with sacremoses,
#   2. train unigram SentencePiece models (multi-corpus and DIR-only) and
#      encode tra.en with each English model,
#   3. run onmt_build_vocab on configs/<DIR>_only.yaml and
#      configs/<DIR>_tuned.yaml.
#
# Usage: run from the project root; the directory name is read from stdin
#        after a prompt and exported as DIR.
# Requires on PATH: sacremoses, spm_train, spm_encode, onmt_build_vocab.

# Abort on the first failing command or unset variable so later steps never
# run against missing or partially written files.
set -eu

# Print a message to stderr and exit with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print the comma-separated list of tokenized training files used for the
# multi-corpus model.
# Arguments: $1 - language code (en or fr)
# Runs inside a command substitution, so its variables do not leak.
multi_corpus_inputs() {
  input_list=
  for corpus in books europarl globalvoices news ted "${DIR}"; do
    input_list="${input_list:+${input_list},}./data/${corpus}/tok/trn.$1"
  done
  printf '%s' "${input_list}"
}

# Train one unigram SentencePiece model with the settings shared by every
# model in this script.
# Arguments: $1 - value for --input (comma-separated file list)
#            $2 - value for --model_prefix
train_unigram() {
  spm_train \
    --input="$1" \
    --model_prefix="$2" \
    --vocab_size=32000 \
    --character_coverage=1.0 \
    --model_type=unigram
}

# Encode the tokenized tra.en file into subword pieces.
# Arguments: $1 - model name under the subword directory (without .model)
#            $2 - output file name under the tok directory
encode_tra_en() {
  spm_encode \
    --model="${subword_dir}/$1.model" \
    --output_format=piece \
    < "${tok_dir}/tra.en" \
    > "${tok_dir}/$2"
}

# Directories
#############

printf '%s\n' 'Choose directory in the data folder'
DIR=
# read returns non-zero at EOF even when it captured text, so its status is
# ignored here and the value itself is validated instead.
read -r DIR || true
[ -n "${DIR}" ] || die 'error: no directory name given'
# Exported so the tools started below inherit it in their environment.
export DIR

data_dir="./data/${DIR}"
tok_dir="${data_dir}/tok"
subword_dir="${data_dir}/subword"

[ -d "${data_dir}" ] || die "error: ${data_dir} does not exist"

mkdir -p "${subword_dir}"
mkdir -p "${data_dir}/vocab"
mkdir -p "${tok_dir}"

# Tokenization
##############

for split in trn val tra; do
  for lang in en fr; do
    sacremoses -l "${lang}" tokenize \
      < "${data_dir}/${split}.${lang}" \
      > "${tok_dir}/${split}.${lang}"
  done
done

# Subword segmentation
######################

# For fine-tuning: models trained on the shared corpora plus DIR.
train_unigram "$(multi_corpus_inputs en)" "${subword_dir}/unigram_multi_en"
train_unigram "$(multi_corpus_inputs fr)" "${subword_dir}/unigram_multi_fr"
encode_tra_en unigram_multi_en tra_unigram_multi.en

# For video game only: models trained on DIR alone.
train_unigram "${tok_dir}/trn.en" "${subword_dir}/unigram_only_en"
train_unigram "${tok_dir}/trn.fr" "${subword_dir}/unigram_only_fr"
encode_tra_en unigram_only_en tra_unigram_only.en

# Building vocab
################

onmt_build_vocab --config "configs/${DIR}_only.yaml" --n_sample -1
onmt_build_vocab --config "configs/${DIR}_tuned.yaml" --n_sample -1