#!/bin/sh
#
# preprocess.sh - data preparation pipeline.
#
# Origin: commit f8672c2e ("Upload preprocessing"), file preprocess.sh.
#
# Steps, in order:
#   1. create the working directories ./output, ./subword and ./vocab;
#   2. tokenize every ./data/<corpus>/*.en and *.fr file with tokenizer.perl,
#      writing <name>_tok.<lang> next to the input;
#   3. train one SentencePiece unigram model per language on the tokenized
#      training files of all corpora listed in CORPORA;
#   4. subword-encode ./data/septimus/tra_tok.en with the English model;
#   5. build the vocabulary with onmt_build_vocab.
#
# Must be run from the directory that contains ./data, ./configs and
# tokenizer.perl. Needs perl, truncate, spm_train, spm_encode and
# onmt_build_vocab on PATH.
#
# The script is plain POSIX sh on purpose: the shebang is /bin/sh, where
# brace expansion ({a,b,c}) is not available.

# Stop at the first failing step (later steps consume the outputs of earlier
# ones) and treat unset variables as errors.
set -eu

# Corpora whose trn_tok.<lang> files feed SentencePiece training.
CORPORA='books europarl globalvoices jv news parallel serval septimus synthetic ted'

# Drop the last byte of a non-empty file. The tools below end their output
# with a newline; this removes it, as the original pipeline did.
# Arguments: $1 - file to shorten in place
strip_last_byte() {
  if [ -s "$1" ]; then
    truncate -s -1 "$1"
  fi
}

# Tokenize every ./data/*/*.<lang> file into <name>_tok.<lang>.
# Runs in a subshell so its variables do not leak into the caller.
# Arguments: $1 - language code (en, fr)
tokenize_lang() (
  lang=$1
  for src in ./data/*/*."$lang"; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -f "$src" ] || continue
    # Skip files produced by a previous run so they are not processed again.
    case $src in
      *_tok."$lang" | *_sub."$lang") continue ;;
    esac
    out="${src%.*}_tok.$lang"
    # NOTE(review): -lc is passed through unchanged from the original script;
    # confirm that the tokenizer.perl in use actually understands it.
    perl tokenizer.perl -l "$lang" -lc < "$src" > "$out"
    strip_last_byte "$out"
  done
)

# Print the comma-separated list of tokenized training files for a language.
# Arguments: $1 - language code
# Outputs:   ./data/<corpus>/trn_tok.<lang>, joined with commas, on stdout
spm_inputs() (
  lang=$1
  list=
  # Intentionally unquoted: CORPORA is a whitespace-separated list of names.
  for corpus in $CORPORA; do
    list="${list:+$list,}./data/$corpus/trn_tok.$lang"
  done
  printf '%s\n' "$list"
)

# Train the SentencePiece model ./subword/septimus_<lang>.{model,vocab}.
# Arguments: $1 - language code
train_spm() (
  lang=$1
  # spm_train takes --input as ONE comma-separated list of files; relying on
  # brace expansion would pass either a literal "{...}" string (sh) or ten
  # separate --input flags (bash).
  spm_train \
    --input="$(spm_inputs "$lang")" \
    --model_prefix="./subword/septimus_$lang" \
    --vocab_size=16000 \
    --character_coverage=1.0 \
    --model_type=unigram
)

# Directories
#############

mkdir -p ./output ./subword ./vocab

# Tokenization
##############

tokenize_lang en
tokenize_lang fr

# Subword segmentation
######################

train_spm en
train_spm fr

# NOTE(review): the file name is "tra_tok.en" (not "trn_tok.en") in the
# original script and is kept as is; confirm this is the intended input.
spm_encode \
  --model=./subword/septimus_en.model \
  --output_format=piece \
  < ./data/septimus/tra_tok.en \
  > ./data/septimus/tra_sub.en

strip_last_byte ./data/septimus/tra_sub.en

# Building vocab
################

onmt_build_vocab --config configs/transformer_tuned.yaml --n_sample -1