From f8672c2ef9e0bcdfc3086662e37f02a9707ffe8a Mon Sep 17 00:00:00 2001
From: Damien Hansen <damien.hansen@uliege.be>
Date: Sat, 24 Feb 2024 18:02:16 +0000
Subject: [PATCH] Upload preprocessing

---
 preprocess.sh | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 preprocess.sh

diff --git a/preprocess.sh b/preprocess.sh
new file mode 100644
index 0000000..b1854ab
--- /dev/null
+++ b/preprocess.sh
@@ -0,0 +1,93 @@
+#!/bin/sh
+#
+# Preprocessing pipeline: tokenizes the corpora under ./data, trains
+# SentencePiece subword models, segments one file and builds the vocab.
+#
+# Sticks to POSIX sh: brace expansion is avoided because a plain sh such
+# as dash does not perform it. Paths are relative to the current directory.
+
+# Abort on the first failing command or unset variable instead of
+# passing incomplete files on to the later stages.
+set -eu
+
+# Sub-directories of ./data whose trn_tok.* files train the subword models.
+CORPORA="books europarl globalvoices jv news parallel serval septimus synthetic ted"
+
+# Helpers
+#########
+
+# Drop the final newline of file $1, if there is one. $(...) strips
+# trailing newlines, so an empty result for a non-empty file means its
+# last byte is a newline.
+strip_final_newline() {
+	if [ -s "$1" ] && [ -z "$(tail -c 1 "$1")" ]; then
+		truncate -s -1 "$1"
+	fi
+}
+
+# Tokenize every ./data/*/*.$1 into <name>_tok.$1 ($1: language code).
+tokenize_lang() {
+	for src in ./data/*/*."$1"; do
+		# An unmatched glob stays literal; there is nothing to do then.
+		[ -e "$src" ] || continue
+		# Leave alone the files this script itself wrote on an earlier run.
+		case $src in
+			*_tok."$1" | *_sub."$1") continue ;;
+		esac
+		out=${src%."$1"}_tok.$1
+		perl tokenizer.perl -l "$1" -lc < "$src" > "$out"
+		strip_final_newline "$out"
+	done
+}
+
+# Print the training files for language $1 as one comma-separated list,
+# the form spm_train expects for --input.
+train_inputs() {
+	list=
+	for corpus in $CORPORA; do
+		list="${list:+$list,}./data/$corpus/trn_tok.$1"
+	done
+	printf '%s\n' "$list"
+}
+
+# Train the SentencePiece model ./subword/septimus_$1 for language $1.
+train_subword() {
+	spm_train \
+		--input="$(train_inputs "$1")" \
+		--model_prefix="./subword/septimus_$1" \
+		--vocab_size=16000 \
+		--character_coverage=1.0 \
+		--model_type=unigram
+}
+
+# Directories
+#############
+
+mkdir -p ./output ./subword ./vocab
+
+# Tokenization
+##############
+
+tokenize_lang en
+tokenize_lang fr
+
+# Subword segmentation
+######################
+
+train_subword en
+train_subword fr
+
+# NOTE(review): the input is tra_tok.en although the models are trained on
+# trn_tok.*; confirm this file name is intended.
+spm_encode \
+	--model=./subword/septimus_en.model \
+	--output_format=piece \
+	< ./data/septimus/tra_tok.en \
+	> ./data/septimus/tra_sub.en
+
+strip_final_newline ./data/septimus/tra_sub.en
+
+# Building vocab
+################
+
+onmt_build_vocab --config configs/transformer_tuned.yaml --n_sample -1
-- 
GitLab