From 182018e9ab31293558cd007a05870d5f7b6a1d0f Mon Sep 17 00:00:00 2001
From: Justin Barton
Date: Sat, 9 Sep 2023 02:15:36 +0100
Subject: [PATCH] adding Huggingface compatible tokenizers

---
 .../apt_tokenizer/special_tokens_map.json     |   7 +
 .../tokenizer/apt_tokenizer/tokenizer.json    | 172 ++++++++++++++++++
 .../apt_tokenizer/tokenizer_config.json       |  10 +
 .../esm_tokenizer/tokenizer_config.json       |   4 +
 protein_lm/tokenizer/esm_tokenizer/vocab.txt  |  33 ++++
 protein_lm/tokenizer/hf_tokenizer.py          |  13 ++
 6 files changed, 239 insertions(+)
 create mode 100644 protein_lm/tokenizer/apt_tokenizer/special_tokens_map.json
 create mode 100644 protein_lm/tokenizer/apt_tokenizer/tokenizer.json
 create mode 100644 protein_lm/tokenizer/apt_tokenizer/tokenizer_config.json
 create mode 100644 protein_lm/tokenizer/esm_tokenizer/tokenizer_config.json
 create mode 100644 protein_lm/tokenizer/esm_tokenizer/vocab.txt
 create mode 100644 protein_lm/tokenizer/hf_tokenizer.py

diff --git a/protein_lm/tokenizer/apt_tokenizer/special_tokens_map.json b/protein_lm/tokenizer/apt_tokenizer/special_tokens_map.json
new file mode 100644
index 0000000..ba0f9b5
--- /dev/null
+++ b/protein_lm/tokenizer/apt_tokenizer/special_tokens_map.json
@@ -0,0 +1,7 @@
+{
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}
diff --git a/protein_lm/tokenizer/apt_tokenizer/tokenizer.json b/protein_lm/tokenizer/apt_tokenizer/tokenizer.json
new file mode 100644
index 0000000..eb8b639
--- /dev/null
+++ b/protein_lm/tokenizer/apt_tokenizer/tokenizer.json
@@ -0,0 +1,172 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "<cls>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 27,
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 28,
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "<cls>": {
+        "id": "<cls>",
+        "ids": [
+          0
+        ],
+        "tokens": [
+          "<cls>"
+        ]
+      },
+      "<eos>": {
+        "id": "<eos>",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "<eos>"
+        ]
+      }
+    }
+  },
+  "decoder": {
+    "type": "ByteLevel",
+    "add_prefix_space": true,
+    "trim_offsets": true,
+    "use_regex": true
+  },
+  "model": {
+    "type": "BPE",
+    "dropout": null,
+    "unk_token": "<unk>",
+    "continuing_subword_prefix": null,
+    "end_of_word_suffix": null,
+    "fuse_unk": false,
+    "byte_fallback": false,
+    "vocab": {
+      "<cls>": 0,
+      "<pad>": 1,
+      "<eos>": 2,
+      "L": 3,
+      "A": 4,
+      "G": 5,
+      "V": 6,
+      "S": 7,
+      "E": 8,
+      "R": 9,
+      "T": 10,
+      "I": 11,
+      "D": 12,
+      "P": 13,
+      "K": 14,
+      "Q": 15,
+      "N": 16,
+      "F": 17,
+      "Y": 18,
+      "M": 19,
+      "H": 20,
+      "W": 21,
+      "C": 22,
+      "B": 23,
+      "U": 24,
+      "Z": 25,
+      "O": 26,
+      "<unk>": 27,
+      "<mask>": 28
+    },
+    "merges": []
+  }
+}
\ No newline at end of file
diff --git a/protein_lm/tokenizer/apt_tokenizer/tokenizer_config.json b/protein_lm/tokenizer/apt_tokenizer/tokenizer_config.json
new file mode 100644
index 0000000..760154b
--- /dev/null
+++ b/protein_lm/tokenizer/apt_tokenizer/tokenizer_config.json
@@ -0,0 +1,10 @@
+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>"
+}
diff --git a/protein_lm/tokenizer/esm_tokenizer/tokenizer_config.json b/protein_lm/tokenizer/esm_tokenizer/tokenizer_config.json
new file mode 100644
index 0000000..3f0d47e
--- /dev/null
+++ b/protein_lm/tokenizer/esm_tokenizer/tokenizer_config.json
@@ -0,0 +1,4 @@
+{
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "EsmTokenizer"
+}
diff --git a/protein_lm/tokenizer/esm_tokenizer/vocab.txt b/protein_lm/tokenizer/esm_tokenizer/vocab.txt
new file mode 100644
index 0000000..6b94695
--- /dev/null
+++ b/protein_lm/tokenizer/esm_tokenizer/vocab.txt
@@ -0,0 +1,33 @@
+<cls>
+<pad>
+<eos>
+<unk>
+L
+A
+G
+V
+S
+E
+R
+T
+I
+D
+P
+K
+Q
+N
+F
+Y
+M
+H
+W
+C
+X
+B
+U
+Z
+O
+.
+-
+<null_1>
+<mask>
\ No newline at end of file
diff --git a/protein_lm/tokenizer/hf_tokenizer.py b/protein_lm/tokenizer/hf_tokenizer.py
new file mode 100644
index 0000000..2ce5fb4
--- /dev/null
+++ b/protein_lm/tokenizer/hf_tokenizer.py
@@ -0,0 +1,13 @@
+import os
+from transformers import PreTrainedTokenizerFast
+from transformers import EsmTokenizer as EsmTokenizerBase
+
+esm_path = os.path.join(os.path.dirname(__file__), 'esm_tokenizer')
+apt_path = os.path.join(os.path.dirname(__file__), 'apt_tokenizer')
+
+
+def EsmTokenizer():
+    return EsmTokenizerBase.from_pretrained(esm_path)
+
+def AptTokenizer():
+    return PreTrainedTokenizerFast.from_pretrained(apt_path)