#!/bin/sh
#
# preprocess.sh - prepare the parallel EN/FR corpora for training.
#
# Steps, in order:
#   1. create the ./output, ./subword and ./vocab directories;
#   2. tokenize every ./data/*/*.en and ./data/*/*.fr file into *_tok.<lang>;
#   3. train one SentencePiece unigram model per language;
#   4. encode ./data/septimus/tra_tok.en into subword pieces;
#   5. build the vocabulary with onmt_build_vocab.
#
# All paths are relative to the current working directory, so tokenizer.perl,
# ./data and ./configs must be reachable from it.
#
# The script sticks to POSIX sh: brace expansion ({a,b,c}) is a bash/ksh
# extension and is passed through literally by shells such as dash, so every
# list is spelled out or built explicitly.

# Abort on the first failing command or unset variable instead of feeding
# partial output into the later stages.
set -eu

# Corpora whose trn_tok.<lang> files are used to train the subword models.
CORPORA='books europarl globalvoices jv news parallel serval septimus synthetic ted'
readonly CORPORA

#######################################
# Tokenize every ./data/*/*.<lang> file into <name>_tok.<lang> next to it.
# Arguments: $1 - language code passed to the tokenizer and used as the
#                 file extension (e.g. en, fr)
# Outputs:   writes one *_tok.<lang> file per input file
# Note:      POSIX sh has no 'local'; lang, src and out are plain globals.
#######################################
tokenize_lang() {
  lang=$1
  for src in ./data/*/*."$lang"; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$src" ] || continue
    # Skip files generated by an earlier run of this script, otherwise a
    # re-run would tokenize them again into *_tok_tok.<lang>.
    case $src in
      *_tok."$lang" | *_sub."$lang") continue ;;
    esac
    out="${src%.*}_tok.$lang"
    perl tokenizer.perl -l "$lang" -lc < "$src" > "$out"
    # Drop the last byte of the output (presumably the trailing newline;
    # TODO confirm against what the downstream tools expect).
    truncate -s -1 "$out"
  done
}

#######################################
# Train a 16000-piece unigram SentencePiece model for one language.
# Globals:   CORPORA (read)
# Arguments: $1 - language code (e.g. en, fr)
# Outputs:   spm_train writes its model files under the prefix
#            ./subword/septimus_<lang>
#######################################
train_subword() {
  lang=$1
  inputs=
  # spm_train takes ONE --input flag holding a comma-separated file list.
  # $CORPORA is deliberately unquoted so it splits into one word per corpus.
  for corpus in $CORPORA; do
    inputs="${inputs:+$inputs,}./data/$corpus/trn_tok.$lang"
  done
  spm_train \
    --input="$inputs" \
    --model_prefix="./subword/septimus_$lang" \
    --vocab_size=16000 \
    --character_coverage=1.0 \
    --model_type=unigram
}

# Directories
#############

mkdir -p ./output ./subword ./vocab

# Tokenization
##############

tokenize_lang en
tokenize_lang fr

# Subword segmentation
######################

train_subword en
train_subword fr

# NOTE(review): only the English side of ./data/septimus/tra_tok.* is encoded
# here, and the prefix is "tra" while training uses "trn" - confirm both are
# intended.
spm_encode \
  --model=./subword/septimus_en.model \
  --output_format=piece \
  < ./data/septimus/tra_tok.en \
  > ./data/septimus/tra_sub.en

# Same last-byte trim as applied to the tokenizer output.
truncate -s -1 ./data/septimus/tra_sub.en

# Building vocab
################

onmt_build_vocab --config configs/transformer_tuned.yaml --n_sample -1