diff --git a/.gitignore b/.gitignore
index 9051b46..5270c16 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 protein_lm/dataset/ProteinGym/
 protein_lm/evaluation/output/
 /esm2*/
-/toy/
+/toy*/
 *.lock
 *.pyc
 wandb/
@@ -9,3 +9,4 @@ checkpoints/
 __pycache__/
 protein_lm.egg-info/
 *.DS_Store
+/.vs/
diff --git a/protein_lm/configs/train/toy_hf.yaml b/protein_lm/configs/train/toy_hf.yaml
index 8ee01d4..c75ff09 100644
--- a/protein_lm/configs/train/toy_hf.yaml
+++ b/protein_lm/configs/train/toy_hf.yaml
@@ -49,6 +49,8 @@ model:
   nn_model_type: "APT"
   nn_model_config_args:
     position_embedding: "learned"
+    rope_scaling_factor: 1.0
+    rope_theta: 10000
     max_sequence_length: 10
   pretrained_checkpoint: null
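Editor's note: the two new keys configure the rotary-position-embedding variant of the APT model, alongside the existing `position_embedding` option. As a rough, hedged illustration of what such parameters usually control (this is not the repository's APT code; the function and argument names below are invented for the sketch), `rope_theta` sets the base of the inverse-frequency schedule and `rope_scaling_factor` stretches the position index:

```python
# Minimal sketch of how rope_theta / rope_scaling_factor typically enter a RoPE cache.
# Assumption: linear ("position interpolation") style scaling; not the repo's implementation.
import torch

def rotary_angles(seq_len: int, head_dim: int,
                  rope_theta: float = 10000.0,
                  rope_scaling_factor: float = 1.0) -> torch.Tensor:
    # Inverse frequencies theta^(-2i/d) for i = 0 .. d/2 - 1
    inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
    # A scaling factor > 1.0 compresses position indices, stretching the usable context
    positions = torch.arange(seq_len).float() / rope_scaling_factor
    return torch.outer(positions, inv_freq)  # angles that feed the sin/cos rotation cache

print(rotary_angles(seq_len=10, head_dim=64).shape)  # torch.Size([10, 32])
```

With `rope_theta: 10000` (the original RoPE default) and `rope_scaling_factor: 1.0`, the added keys only make the defaults explicit and leave the toy config's behaviour unchanged.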
+ """ + fasta_iterator = SeqIO.parse(fasta_file, "fasta") + + # Initialize dictionaries to store counts and frequencies + total_frequency = {x: 0 for x in alphabet} + + # Iterate over the records in the FASTA file + for record in fasta_iterator: + # Get sequence as a string + seq = str(record.seq) + + # Compute counts and frequencies for this sequence + frequency = get_frquency(seq, alphabet) + + # Accumulate counts and frequencies + for aa in alphabet: + total_frequency[aa] += frequency[aa] + + # Normalize frequencies by the number of sequences + num_sequences = sum(1 for _ in SeqIO.parse(fasta_file, "fasta")) + for aa in alphabet: + total_frequency[aa] /= num_seqs + + return total_frequency + +def compute_KLD_fasta(fasta_file, alphabet, background_freq): + """ + computes the KLD witht the background of the given fasta. + """ + + fasta_iterator = SeqIO.parse(fasta_file, "fasta") + + KLDs = {} + + for rec in fasta_iterator: + # get ID and seq, pack into dict as id:KLD + KLDs[rec.id] = compute_kullback_leibler(str(rec.seq), alphabet, background_freq) + + return KLDs + +# %% +# intrinsic dimension as suggested by @Amelie-Schreiber +# https://huggingface.co/blog/AmelieSchreiber/intrinsic-dimension-of-proteins + +def estimate_persistent_homology_dimension_avg(sequence, model, tokenizer, num_subsets=5, num_iterations=10): + """ + Estimate the persistent homology dimension of a given protein sequence. + + Parameters: + - sequence: A string representing the protein sequence. + - model: a model that computes embeddings from the prot seq, tested only with esm atm + - tokenizer: tokenizer fitting to the model. + - num_subsets: A positive integer indicating the number of subsets of the embedding vectors to use. Max of 2**n where n=len(sequence). + - num_iterations: A positive integer indicating the number of iterations for averaging. + + Returns: + - avg_phd: Average estimated persistent homology dimension. 
+ """ + + phd_values = [] # List to store PHD values for each iteration + + for _ in range(num_iterations): + + # Tokenize the input and convert to tensors + inputs = tokenizer(sequence, return_tensors='pt') + + # Get the embeddings + with torch.no_grad(): + outputs = model(**inputs) + embeddings = outputs.last_hidden_state[0].numpy() + + # Remove the first and last embeddings ( and ) + embeddings = embeddings[1:-1] + + # Sizes for the subsets to sample + sizes = np.linspace(2, len(embeddings), num=num_subsets, dtype=int) + + # Prepare data for linear regression + x = [] + y = [] + + for size in sizes: + # Sample a subset of the embeddings + subset = np.random.choice(len(embeddings), size, replace=False) + subset_embeddings = embeddings[subset] + + # Compute the distance matrix + dist_matrix = np.sqrt(np.sum((subset_embeddings[:, None] - subset_embeddings)**2, axis=-1)) + + # Compute the minimum spanning tree + mst = minimum_spanning_tree(dist_matrix).toarray() + + # Calculate the persistent score E (the maximum edge length in the MST) + E = np.max(mst) + + # Append to the data for linear regression + x.append(np.log(size)) + y.append(np.log(E)) + + # Reshape for sklearn + X = np.array(x).reshape(-1, 1) + Y = np.array(y).reshape(-1, 1) + + # Linear regression + reg = LinearRegression().fit(X, Y) + + # Estimated Persistent Homology Dimension for this iteration + phd = 1 / (1 - reg.coef_[0][0]) + + phd_values.append(phd) + + avg_phd = np.mean(phd_values) # Average over all iterations + return avg_phd +# %% + +prot_seq = "ARNDCEQGHILKMFPSTWYVARNDCEQGHILKMFPSTWYV" +prot_seq_halfhalf = "ARNDCEQGHILKMFPSTWYVAAAAAAAAAAAAAAAAAAAA" +prot_seq_low_comp = "MMAAAMMAAAMMAAAMMAAAMMAAAMMAAAMMAAAMMAAA" +prot_seq_homo_rep = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" +prot_seq_homo_rep_halflength = "AAAAAAAAAAAAAAAAAAAA" + +alphabet = [ + 'A', + 'R', + 'N', + 'D', + 'C', + 'E', + 'Q', + 'G', + 'H', + 'I', + 'L', + 'K', + 'M', + 'F', + 'P', + 'S', + 'T', + 'W', + 'Y', + 'V' +] + +test_fasta_path = "C:/Users/maxsp/Work/prots_test_complexity.fasta" +num_sequences = sum(1 for _ in SeqIO.parse(test_fasta_path, "fasta")) + +# %% +# run on test fasta +background = get_background_from_fasta_no_alignment(test_fasta_path, alphabet, num_sequences) +KLD = compute_KLD_fasta(test_fasta_path, alphabet, background) + +# %% +# test intrinsic dim stuff +# Load the tokenizer and model +model_path = "facebook/esm2_t6_8M_UR50D" +tokenizer = AutoTokenizer.from_pretrained(model_path) +model = EsmModel.from_pretrained(model_path) + +estimate_persistent_homology_dimension_avg(prot_seq, model, tokenizer, num_subsets=2, num_iterations=10) +estimate_persistent_homology_dimension_avg(prot_seq_low_comp, model, tokenizer, num_subsets=2, num_iterations=10) \ No newline at end of file diff --git a/protein_lm/evaluation/scripts/Protein-gym.py b/protein_lm/evaluation/scripts/Protein-gym.py deleted file mode 100644 index 5d0d0db..0000000 --- a/protein_lm/evaluation/scripts/Protein-gym.py +++ /dev/null @@ -1,149 +0,0 @@ -###################################################################################################### -# Script: ProteinGym Eval script -# Authors: Maximilian Sprang, Muedi -# Date: 08/2023 -# Description: This script downloads and handles the zipped Protein Gym Csvs and preprocesses them -# to be able to tokenized by EMS/ProtBERT tokenizers. 
diff --git a/protein_lm/evaluation/scripts.py/download_proteingym_data.py b/protein_lm/evaluation/scripts/download_proteingym_data.py
similarity index 100%
rename from protein_lm/evaluation/scripts.py/download_proteingym_data.py
rename to protein_lm/evaluation/scripts/download_proteingym_data.py
diff --git a/protein_lm/evaluation/scripts.py/fitness_supervised.py b/protein_lm/evaluation/scripts/fitness_supervised.py
similarity index 100%
rename from protein_lm/evaluation/scripts.py/fitness_supervised.py
rename to protein_lm/evaluation/scripts/fitness_supervised.py
diff --git a/protein_lm/evaluation/scripts.py/fitness_zero_shot_AR.py b/protein_lm/evaluation/scripts/fitness_zero_shot_AR.py
similarity index 100%
rename from protein_lm/evaluation/scripts.py/fitness_zero_shot_AR.py
rename to protein_lm/evaluation/scripts/fitness_zero_shot_AR.py
diff --git a/protein_lm/evaluation/scripts.py/fitness_zero_shot_ESM.py b/protein_lm/evaluation/scripts/fitness_zero_shot_ESM.py
similarity index 100%
rename from protein_lm/evaluation/scripts.py/fitness_zero_shot_ESM.py
rename to protein_lm/evaluation/scripts/fitness_zero_shot_ESM.py
diff --git a/protein_lm_cuda.yml b/protein_lm_cuda.yml
index 3ce6cb4..d189a0f 100644
--- a/protein_lm_cuda.yml
+++ b/protein_lm_cuda.yml
@@ -6,14 +6,19 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - python>=3.8
+  - python>=3.8,<3.11
   - numpy
   - scipy
   - cudatoolkit=11.7
   - pytorch-cuda=11.7
   - pydantic>=2.0
+
   - wandb
   - rust
+  - biotite
+  - pyarrow
+  - pip
   - pip:
+    - cmake
     - transformers
     - datasets
     - accelerate
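Editor's note: after recreating the environment with `conda env create -f protein_lm_cuda.yml`, a small optional snippet like the one below can confirm that the importable dependencies listed in this hunk resolve. The package names are taken from the hunk above and nothing else is assumed; `rust` and `cmake` are build-time tools, so they are better checked with `rustc --version` / `cmake --version` on the command line.

```python
# Optional post-install check for the importable dependencies from protein_lm_cuda.yml.
import importlib

for pkg in ("numpy", "scipy", "pydantic", "wandb", "biotite", "pyarrow",
            "transformers", "datasets", "accelerate"):
    try:
        mod = importlib.import_module(pkg)
        print(f"{pkg:<14} {getattr(mod, '__version__', 'ok')}")
    except ImportError as err:
        print(f"{pkg:<14} MISSING ({err})")
```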