From 182018e9ab31293558cd007a05870d5f7b6a1d0f Mon Sep 17 00:00:00 2001
From: Justin Barton
Date: Sat, 9 Sep 2023 02:15:36 +0100
Subject: [PATCH] adding Huggingface compatible tokenizers

---
 .../apt_tokenizer/special_tokens_map.json     |   7 +
 .../tokenizer/apt_tokenizer/tokenizer.json    | 172 ++++++++++++++++++
 .../apt_tokenizer/tokenizer_config.json       |  10 +
 .../esm_tokenizer/tokenizer_config.json       |   4 +
 protein_lm/tokenizer/esm_tokenizer/vocab.txt  |  33 ++++
 protein_lm/tokenizer/hf_tokenizer.py          |  13 ++
 6 files changed, 239 insertions(+)
 create mode 100644 protein_lm/tokenizer/apt_tokenizer/special_tokens_map.json
 create mode 100644 protein_lm/tokenizer/apt_tokenizer/tokenizer.json
 create mode 100644 protein_lm/tokenizer/apt_tokenizer/tokenizer_config.json
 create mode 100644 protein_lm/tokenizer/esm_tokenizer/tokenizer_config.json
 create mode 100644 protein_lm/tokenizer/esm_tokenizer/vocab.txt
 create mode 100644 protein_lm/tokenizer/hf_tokenizer.py

diff --git a/protein_lm/tokenizer/apt_tokenizer/special_tokens_map.json b/protein_lm/tokenizer/apt_tokenizer/special_tokens_map.json
new file mode 100644
index 0000000..ba0f9b5
--- /dev/null
+++ b/protein_lm/tokenizer/apt_tokenizer/special_tokens_map.json
@@ -0,0 +1,7 @@
+{
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}
diff --git a/protein_lm/tokenizer/apt_tokenizer/tokenizer.json b/protein_lm/tokenizer/apt_tokenizer/tokenizer.json
new file mode 100644
index 0000000..eb8b639
--- /dev/null
+++ b/protein_lm/tokenizer/apt_tokenizer/tokenizer.json
@@ -0,0 +1,172 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "<cls>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "<pad>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "<eos>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 27,
+      "content": "<unk>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 28,
+      "content": "<mask>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "<cls>",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "<eos>",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "<cls>": {
+        "id": "<cls>",
+        "ids": [
+          0
+        ],
+        "tokens": [
+          "<cls>"
+        ]
+      },
+      "<eos>": {
+        "id": "<eos>",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "<eos>"
+        ]
+      }
+    }
+  },
+  "decoder": {
+    "type": "ByteLevel",
+    "add_prefix_space": true,
+    "trim_offsets": true,
+    "use_regex": true
+  },
+  "model": {
+    "type": "BPE",
+    "dropout": null,
+    "unk_token": "<unk>",
+    "continuing_subword_prefix": null,
+    "end_of_word_suffix": null,
+    "fuse_unk": false,
+    "byte_fallback": false,
+    "vocab": {
+      "<cls>": 0,
+      "<pad>": 1,
+      "<eos>": 2,
+      "L": 3,
+      "A": 4,
+      "G": 5,
+      "V": 6,
+      "S": 7,
+      "E": 8,
+      "R": 9,
+      "T": 10,
+      "I": 11,
+      "D": 12,
+      "P": 13,
+      "K": 14,
+      "Q": 15,
+      "N": 16,
+      "F": 17,
+      "Y": 18,
+      "M": 19,
+      "H": 20,
+      "W": 21,
+      "C": 22,
+      "B": 23,
+      "U": 24,
+      "Z": 25,
+      "O": 26,
+      "<unk>": 27,
+      "<mask>": 28
+    },
+    "merges": []
+  }
+}
\ No newline at end of file
diff --git a/protein_lm/tokenizer/apt_tokenizer/tokenizer_config.json b/protein_lm/tokenizer/apt_tokenizer/tokenizer_config.json
new file mode 100644
index 0000000..760154b
--- /dev/null
+++ b/protein_lm/tokenizer/apt_tokenizer/tokenizer_config.json
@@ -0,0 +1,10 @@
+{
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "unk_token": "<unk>"
+}
diff --git a/protein_lm/tokenizer/esm_tokenizer/tokenizer_config.json b/protein_lm/tokenizer/esm_tokenizer/tokenizer_config.json
new file mode 100644
index 0000000..3f0d47e
--- /dev/null
+++ b/protein_lm/tokenizer/esm_tokenizer/tokenizer_config.json
@@ -0,0 +1,4 @@
+{
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "EsmTokenizer"
+}
diff --git a/protein_lm/tokenizer/esm_tokenizer/vocab.txt b/protein_lm/tokenizer/esm_tokenizer/vocab.txt
new file mode 100644
index 0000000..6b94695
--- /dev/null
+++ b/protein_lm/tokenizer/esm_tokenizer/vocab.txt
@@ -0,0 +1,33 @@
+<cls>
+<pad>
+<eos>
+<unk>
+L
+A
+G
+V
+S
+E
+R
+T
+I
+D
+P
+K
+Q
+N
+F
+Y
+M
+H
+W
+C
+X
+B
+U
+Z
+O
+.
+-
+<null_1>
+<mask>
\ No newline at end of file
diff --git a/protein_lm/tokenizer/hf_tokenizer.py b/protein_lm/tokenizer/hf_tokenizer.py
new file mode 100644
index 0000000..2ce5fb4
--- /dev/null
+++ b/protein_lm/tokenizer/hf_tokenizer.py
@@ -0,0 +1,13 @@
+import os
+from transformers import PreTrainedTokenizerFast
+from transformers import EsmTokenizer as EsmTokenizerBase
+
+esm_path = os.path.join(os.path.dirname(__file__), 'esm_tokenizer')
+apt_path = os.path.join(os.path.dirname(__file__), 'apt_tokenizer')
+
+
+def EsmTokenizer():
+    return EsmTokenizerBase.from_pretrained(esm_path)
+
+def AptTokenizer():
+    return PreTrainedTokenizerFast.from_pretrained(apt_path)