Skip to content
Snippets Groups Projects
Commit 2b06876c authored by Damien Hansen's avatar Damien Hansen
Browse files

Adding preprocessing steps

parent d8f7be62
No related branches found
No related tags found
No related merge requests found
#!/bin/sh
# Preprocess one parallel EN/FR corpus stored under ./data/<DIR>:
# tokenization, SentencePiece subword models, then OpenNMT vocabularies.
# Run from the project root, since every path below is relative.

# Directories
#############
echo Choose directory in the data folder
# -r keeps backslashes in the answer literal. The status of read is ignored on
# purpose: input without a trailing newline still fills DIR, and an empty
# answer is rejected just below.
read -r DIR || :
if [ -z "${DIR}" ]; then
  echo "No directory given" >&2
  exit 1
fi
# Refuse unknown corpora instead of letting mkdir -p create stray folders.
if [ ! -d "./data/${DIR}" ]; then
  echo "./data/${DIR} is not a directory" >&2
  exit 1
fi
# Exported so that child processes can read DIR from their environment.
export DIR
# Output folders for the subword models, the vocabularies and the tokenized text.
for sub in subword vocab tok; do
  mkdir -p "./data/${DIR}/${sub}" || exit 1
done
# Tokenization
##############
# Tokenize every split (trn, val, tra) of ./data/${DIR} for both languages
# with sacremoses and write the result under ./data/${DIR}/tok, keeping the
# file name. Stop at the first failure so that later steps never run on a
# missing or truncated file.
for split in trn val tra; do
  for lang in en fr; do
    sacremoses -l "${lang}" tokenize \
      < "./data/${DIR}/${split}.${lang}" \
      > "./data/${DIR}/tok/${split}.${lang}" \
      || { echo "Tokenization failed for ${split}.${lang}" >&2; exit 1; }
  done
done
# Subword segmentation
######################

# train_unigram INPUT PREFIX
#   Train a unigram SentencePiece model (32000 pieces, full character
#   coverage) on INPUT, a comma-separated list of files, and store it as
#   PREFIX.model / PREFIX.vocab. Returns the status of spm_train.
train_unigram() {
  spm_train \
    --input="$1" \
    --model_prefix="$2" \
    --vocab_size=32000 \
    --character_coverage=1.0 \
    --model_type=unigram
}

# encode_pieces MODEL INFILE OUTFILE
#   Segment INFILE into subword pieces with MODEL and write them to OUTFILE.
#   Returns the status of spm_encode.
encode_pieces() {
  spm_encode \
    --model="$1" \
    --output_format=piece \
    < "$2" \
    > "$3"
}

# For fine-tuning
# One model per language, trained on the generic corpora plus ${DIR}.
for lang in en fr; do
  # Join the paths with commas only BETWEEN entries: the list must not end
  # with a comma, which would add an empty file name to --input.
  multi_input=
  for corpus in books europarl globalvoices news ted "${DIR}"; do
    multi_input="${multi_input:+${multi_input},}./data/${corpus}/tok/trn.${lang}"
  done
  train_unigram "${multi_input}" \
    "./data/${DIR}/subword/unigram_multi_${lang}" || exit 1
done
encode_pieces "./data/${DIR}/subword/unigram_multi_en.model" \
  "./data/${DIR}/tok/tra.en" \
  "./data/${DIR}/tok/tra_unigram_multi.en" || exit 1

# For video game only
# One model per language, trained on ${DIR} alone.
for lang in en fr; do
  train_unigram "./data/${DIR}/tok/trn.${lang}" \
    "./data/${DIR}/subword/unigram_only_${lang}" || exit 1
done
encode_pieces "./data/${DIR}/subword/unigram_only_en.model" \
  "./data/${DIR}/tok/tra.en" \
  "./data/${DIR}/tok/tra_unigram_only.en" || exit 1
# Building vocab
################
# Build the OpenNMT vocabularies for both configurations of ${DIR}
# (configs/${DIR}_only.yaml, then configs/${DIR}_tuned.yaml).
# --n_sample -1 asks onmt_build_vocab to use the full corpus rather than a
# sample. Stop at the first failure instead of silently moving on.
for variant in only tuned; do
  onmt_build_vocab --config "configs/${DIR}_${variant}.yaml" --n_sample -1 \
    || { echo "Vocabulary build failed for ${DIR}_${variant}" >&2; exit 1; }
done
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment