diff --git a/.gitignore b/.gitignore index 4a87b23..943dd15 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ __pycache__/ rustbpe/target/ dev-ignore/ report.md -eval_bundle/ \ No newline at end of file +eval_bundle/ +*.parquet \ No newline at end of file diff --git a/data/climbmix_small/detokenize_climbmix.py b/data/nemotron_climbmix/detokenize_climbmix.py similarity index 100% rename from data/climbmix_small/detokenize_climbmix.py rename to data/nemotron_climbmix/detokenize_climbmix.py diff --git a/data/climbmix_small/download_climbmix.py b/data/nemotron_climbmix/download_climbmix.py similarity index 100% rename from data/climbmix_small/download_climbmix.py rename to data/nemotron_climbmix/download_climbmix.py diff --git a/data/climbmix_small/process_climbmix.sh b/data/nemotron_climbmix/process_climbmix.sh similarity index 59% rename from data/climbmix_small/process_climbmix.sh rename to data/nemotron_climbmix/process_climbmix.sh index 8de41e5..9bfca04 100644 --- a/data/climbmix_small/process_climbmix.sh +++ b/data/nemotron_climbmix/process_climbmix.sh @@ -1,6 +1,3 @@ -# pip install -U "huggingface_hub[hf_transfer]" -# export HF_HUB_ENABLE_HF_TRANSFER=1 - python download_climbmix.py python detokenize_climbmix.py --input_folder climbmix_small --output_folder ./ rm -rf climbmix_small \ No newline at end of file