diff --git a/.gitignore b/.gitignore
index 9051b46..5270c16 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 protein_lm/dataset/ProteinGym/
 protein_lm/evaluation/output/
 /esm2*/
-/toy/
+/toy*/
 *.lock
 *.pyc
 wandb/
@@ -9,3 +9,4 @@ checkpoints/
 __pycache__/
 protein_lm.egg-info/
 *.DS_Store
+/.vs/
diff --git a/protein_lm/configs/train/toy_hf.yaml b/protein_lm/configs/train/toy_hf.yaml
index 8ee01d4..c75ff09 100644
--- a/protein_lm/configs/train/toy_hf.yaml
+++ b/protein_lm/configs/train/toy_hf.yaml
@@ -49,6 +49,8 @@ model:
   nn_model_type: "APT"
   nn_model_config_args:
     position_embedding: "learned"
+    rope_scaling_factor: 1.0
+    rope_theta: 10000
     max_sequence_length: 10
   pretrained_checkpoint: null
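Editor's note: the two new keys configure the rotary-position-embedding variant of the APT model, alongside the existing `position_embedding` option. As a rough, hedged illustration of what such parameters usually control (this is not the repository's APT code; the function and argument names below are invented for the sketch), `rope_theta` sets the base of the inverse-frequency schedule and `rope_scaling_factor` stretches the position index:

```python
# Minimal sketch of how rope_theta / rope_scaling_factor typically enter a RoPE cache.
# Assumption: linear ("position interpolation") style scaling; not the repo's implementation.
import torch

def rotary_angles(seq_len: int, head_dim: int,
                  rope_theta: float = 10000.0,
                  rope_scaling_factor: float = 1.0) -> torch.Tensor:
    # Inverse frequencies theta^(-2i/d) for i = 0 .. d/2 - 1
    inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
    # A scaling factor > 1.0 compresses position indices, stretching the usable context
    positions = torch.arange(seq_len).float() / rope_scaling_factor
    return torch.outer(positions, inv_freq)  # angles that feed the sin/cos rotation cache

print(rotary_angles(seq_len=10, head_dim=64).shape)  # torch.Size([10, 32])
```

With `rope_theta: 10000` (the original RoPE default) and `rope_scaling_factor: 1.0`, the added keys only make the defaults explicit and leave the toy config's behaviour unchanged.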
+ """ + fasta_iterator = SeqIO.parse(fasta_file, "fasta") + + # Initialize dictionaries to store counts and frequencies + total_frequency = {x: 0 for x in alphabet} + + # Iterate over the records in the FASTA file + for record in fasta_iterator: + # Get sequence as a string + seq = str(record.seq) + + # Compute counts and frequencies for this sequence + frequency = get_frquency(seq, alphabet) + + # Accumulate counts and frequencies + for aa in alphabet: + total_frequency[aa] += frequency[aa] + + # Normalize frequencies by the number of sequences + num_sequences = sum(1 for _ in SeqIO.parse(fasta_file, "fasta")) + for aa in alphabet: + total_frequency[aa] /= num_seqs + + return total_frequency + +def compute_KLD_fasta(fasta_file, alphabet, background_freq): + """ + computes the KLD witht the background of the given fasta. + """ + + fasta_iterator = SeqIO.parse(fasta_file, "fasta") + + KLDs = {} + + for rec in fasta_iterator: + # get ID and seq, pack into dict as id:KLD + KLDs[rec.id] = compute_kullback_leibler(str(rec.seq), alphabet, background_freq) + + return KLDs + +# %% +# intrinsic dimension as suggested by @Amelie-Schreiber +# https://huggingface.co/blog/AmelieSchreiber/intrinsic-dimension-of-proteins + +def estimate_persistent_homology_dimension_avg(sequence, model, tokenizer, num_subsets=5, num_iterations=10): + """ + Estimate the persistent homology dimension of a given protein sequence. + + Parameters: + - sequence: A string representing the protein sequence. + - model: a model that computes embeddings from the prot seq, tested only with esm atm + - tokenizer: tokenizer fitting to the model. + - num_subsets: A positive integer indicating the number of subsets of the embedding vectors to use. Max of 2**n where n=len(sequence). + - num_iterations: A positive integer indicating the number of iterations for averaging. + + Returns: + - avg_phd: Average estimated persistent homology dimension. 
+ """ + + phd_values = [] # List to store PHD values for each iteration + + for _ in range(num_iterations): + + # Tokenize the input and convert to tensors + inputs = tokenizer(sequence, return_tensors='pt') + + # Get the embeddings + with torch.no_grad(): + outputs = model(**inputs) + embeddings = outputs.last_hidden_state[0].numpy() + + # Remove the first and last embeddings ( and ) + embeddings = embeddings[1:-1] + + # Sizes for the subsets to sample + sizes = np.linspace(2, len(embeddings), num=num_subsets, dtype=int) + + # Prepare data for linear regression + x = [] + y = [] + + for size in sizes: + # Sample a subset of the embeddings + subset = np.random.choice(len(embeddings), size, replace=False) + subset_embeddings = embeddings[subset] + + # Compute the distance matrix + dist_matrix = np.sqrt(np.sum((subset_embeddings[:, None] - subset_embeddings)**2, axis=-1)) + + # Compute the minimum spanning tree + mst = minimum_spanning_tree(dist_matrix).toarray() + + # Calculate the persistent score E (the maximum edge length in the MST) + E = np.max(mst) + + # Append to the data for linear regression + x.append(np.log(size)) + y.append(np.log(E)) + + # Reshape for sklearn + X = np.array(x).reshape(-1, 1) + Y = np.array(y).reshape(-1, 1) + + # Linear regression + reg = LinearRegression().fit(X, Y) + + # Estimated Persistent Homology Dimension for this iteration + phd = 1 / (1 - reg.coef_[0][0]) + + phd_values.append(phd) + + avg_phd = np.mean(phd_values) # Average over all iterations + return avg_phd +# %% + +prot_seq = "ARNDCEQGHILKMFPSTWYVARNDCEQGHILKMFPSTWYV" +prot_seq_halfhalf = "ARNDCEQGHILKMFPSTWYVAAAAAAAAAAAAAAAAAAAA" +prot_seq_low_comp = "MMAAAMMAAAMMAAAMMAAAMMAAAMMAAAMMAAAMMAAA" +prot_seq_homo_rep = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" +prot_seq_homo_rep_halflength = "AAAAAAAAAAAAAAAAAAAA" + +alphabet = [ + 'A', + 'R', + 'N', + 'D', + 'C', + 'E', + 'Q', + 'G', + 'H', + 'I', + 'L', + 'K', + 'M', + 'F', + 'P', + 'S', + 'T', + 'W', + 'Y', + 'V' +] + +test_fasta_path = "C:/Users/maxsp/Work/prots_test_complexity.fasta" +num_sequences = sum(1 for _ in SeqIO.parse(test_fasta_path, "fasta")) + +# %% +# run on test fasta +background = get_background_from_fasta_no_alignment(test_fasta_path, alphabet, num_sequences) +KLD = compute_KLD_fasta(test_fasta_path, alphabet, background) + +# %% +# test intrinsic dim stuff +# Load the tokenizer and model +model_path = "facebook/esm2_t6_8M_UR50D" +tokenizer = AutoTokenizer.from_pretrained(model_path) +model = EsmModel.from_pretrained(model_path) + +estimate_persistent_homology_dimension_avg(prot_seq, model, tokenizer, num_subsets=2, num_iterations=10) +estimate_persistent_homology_dimension_avg(prot_seq_low_comp, model, tokenizer, num_subsets=2, num_iterations=10) \ No newline at end of file diff --git a/protein_lm/evaluation/scripts/Protein-gym.py b/protein_lm/evaluation/scripts/Protein-gym.py deleted file mode 100644 index 5d0d0db..0000000 --- a/protein_lm/evaluation/scripts/Protein-gym.py +++ /dev/null @@ -1,149 +0,0 @@ -###################################################################################################### -# Script: ProteinGym Eval script -# Authors: Maximilian Sprang, Muedi -# Date: 08/2023 -# Description: This script downloads and handles the zipped Protein Gym Csvs and preprocesses them -# to be able to tokenized by EMS/ProtBERT tokenizers. 
diff --git a/protein_lm/evaluation/scripts.py/download_proteingym_data.py b/protein_lm/evaluation/scripts/download_proteingym_data.py
similarity index 100%
rename from protein_lm/evaluation/scripts.py/download_proteingym_data.py
rename to protein_lm/evaluation/scripts/download_proteingym_data.py
diff --git a/protein_lm/evaluation/scripts.py/fitness_supervised.py b/protein_lm/evaluation/scripts/fitness_supervised.py
similarity index 100%
rename from protein_lm/evaluation/scripts.py/fitness_supervised.py
rename to protein_lm/evaluation/scripts/fitness_supervised.py
diff --git a/protein_lm/evaluation/scripts.py/fitness_zero_shot_AR.py b/protein_lm/evaluation/scripts/fitness_zero_shot_AR.py
similarity index 100%
rename from protein_lm/evaluation/scripts.py/fitness_zero_shot_AR.py
rename to protein_lm/evaluation/scripts/fitness_zero_shot_AR.py
diff --git a/protein_lm/evaluation/scripts.py/fitness_zero_shot_ESM.py b/protein_lm/evaluation/scripts/fitness_zero_shot_ESM.py
similarity index 100%
rename from protein_lm/evaluation/scripts.py/fitness_zero_shot_ESM.py
rename to protein_lm/evaluation/scripts/fitness_zero_shot_ESM.py
diff --git a/protein_lm_cuda.yml b/protein_lm_cuda.yml
index 3ce6cb4..d189a0f 100644
--- a/protein_lm_cuda.yml
+++ b/protein_lm_cuda.yml
@@ -6,14 +6,19 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - python>=3.8
+  - python>=3.8,<3.11
   - numpy
   - scipy
   - cudatoolkit=11.7
   - pytorch-cuda=11.7
   - pydantic>=2.0
+
   - wandb
   - rust
+  - biotite
+  - pyarrow
+  - pip
   - pip:
+    - cmake
     - transformers
     - datasets
     - accelerate
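Editor's note: after recreating the environment with `conda env create -f protein_lm_cuda.yml`, a small optional snippet like the one below can confirm that the importable dependencies listed in this hunk resolve. The package names are taken from the hunk above and nothing else is assumed; `rust` and `cmake` are build-time tools, so they are better checked with `rustc --version` / `cmake --version` on the command line.

```python
# Optional post-install check for the importable dependencies from protein_lm_cuda.yml.
import importlib

for pkg in ("numpy", "scipy", "pydantic", "wandb", "biotite", "pyarrow",
            "transformers", "datasets", "accelerate"):
    try:
        mod = importlib.import_module(pkg)
        print(f"{pkg:<14} {getattr(mod, '__version__', 'ok')}")
    except ImportError as err:
        print(f"{pkg:<14} MISSING ({err})")
```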