Binary file removed .DS_Store
11 changes: 5 additions & 6 deletions .gitignore
@@ -1,11 +1,10 @@
validators/state.json
bittensor_subnet_template.egg-info/
bittensor_subnet_template.egg-info
template/__pycache__/
__pycache__/
*.egg-info
__pycache__
*.pyc
state.json
wandb/
.vscode
.envrc
.idea/
.idea/
.DS_Store
venv
136 changes: 0 additions & 136 deletions Cortex.t.egg-info/PKG-INFO

This file was deleted.

14 changes: 0 additions & 14 deletions Cortex.t.egg-info/SOURCES.txt

This file was deleted.

1 change: 0 additions & 1 deletion Cortex.t.egg-info/dependency_links.txt

This file was deleted.

16 changes: 0 additions & 16 deletions Cortex.t.egg-info/requires.txt

This file was deleted.

3 changes: 0 additions & 3 deletions Cortex.t.egg-info/top_level.txt

This file was deleted.

4 changes: 2 additions & 2 deletions README.md
@@ -18,9 +18,9 @@

Introducing Bittensor Subnet 18 (Cortex.t): A Pioneering Platform for AI Development and Synthetic Data Generation.

Cortex.t stands at the forefront of artificial intelligence, offering a dual-purpose solution that caters to the needs of app developers and innovators in the AI space. This platform is meticulously designed to deliver reliable, high-quality text and image responses through API usage, utilising the decentralised Bittensor network. It serves as a cornerstone for creating a fair, transparent, and manipulation-free environment for the incentivised production of intelligence (mining) and generation and fulfilment of diverse user prompts.
Cortex.t stands at the forefront of artificial intelligence, offering a dual-purpose solution that caters to the needs of app developers and innovators in the AI space. This platform is meticulously designed to deliver reliable, high-quality text, image, and TTS responses through API usage, utilising the decentralised Bittensor network. It serves as a cornerstone for creating a fair, transparent, and manipulation-free environment for the incentivised production of intelligence (mining) and generation and fulfilment of diverse user prompts.

Our initiative is a leap forward in redefining the reward system for text and image prompting with a commitment to providing stability and reassurance to developers. By focusing on the value delivered to clients, we alleviate the concerns of data inconsistencies that often plague app development. The quality of Cortex.t is seamlessly integrated within the Bittensor network, allowing developers to harness the power of multiple subnets and modalities by building directly onto an existing validator, or through an API key from [Corcel](https://corcel.io).
Our initiative is a leap forward in redefining the reward system for text, image, and TTS prompting with a commitment to providing stability and reassurance to developers. By focusing on the value delivered to clients, we alleviate the concerns of data inconsistencies that often plague app development. The quality of Cortex.t is seamlessly integrated within the Bittensor network, allowing developers to harness the power of multiple subnets and modalities by building directly onto an existing validator, or through an API key from [Corcel](https://corcel.io).

Cortex.t is also a transformative platform leveraging advanced AI models to generate synthetic prompt-response pairs. This novel method yields a comprehensive dataset of interactions, archived in wandb [wandb.ai/cortex-t/synthetic-QA](https://wandb.ai/cortex-t/synthetic-QA). The process involves recycling model outputs back into the system, using a prompt evolution and data augmentation strategy similar to Microsoft's approach in developing WizardLM. This enables the distillation of sophisticated AI models into smaller, yet efficient counterparts, mirroring the performance of their larger predecessors. Ultimately, Cortex.t democratizes access to high-end AI technology, encouraging innovation and customization.

14 changes: 0 additions & 14 deletions bittensor_subnet_template.egg-info/SOURCES.txt

This file was deleted.

1 change: 0 additions & 1 deletion bittensor_subnet_template.egg-info/dependency_links.txt

This file was deleted.

3 changes: 0 additions & 3 deletions bittensor_subnet_template.egg-info/top_level.txt

This file was deleted.

11 changes: 11 additions & 0 deletions cortext/__init__.py
@@ -45,6 +45,7 @@
PROMPT_BLACKLIST_STAKE = 20000
IMAGE_BLACKLIST_STAKE = 20000
EMBEDDING_BLACKLIST_STAKE = 20000
TTS_BLACKLIST_STAKE = 20000
ISALIVE_BLACKLIST_STAKE = min(PROMPT_BLACKLIST_STAKE, IMAGE_BLACKLIST_STAKE, EMBEDDING_BLACKLIST_STAKE)
MIN_REQUEST_PERIOD = 2
MAX_REQUESTS = 12
@@ -3737,6 +3738,16 @@
'Mystical Landscapes'
]

TTS_THEMES = [
    'News and weather',
    'Advertising',
    'Health and wellness',
    'Historical events',
    'Sports news',
    'Trivia',
    'Paparazzi',
]


# Import all submodules.
from . import protocol
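For context on how the new constants might be exercised, here is a minimal sketch of a validator picking a theme for a synthetic TTS request. The `build_tts_text` helper and its stub wording are illustrative assumptions, not part of this change:

```python
import random

from cortext import TTS_THEMES, TTS_BLACKLIST_STAKE

def build_tts_text(theme: str) -> str:
    # Hypothetical helper: a real validator would likely ask an LLM for a short
    # passage on the chosen theme; here a stub sentence stands in for that step.
    return f"A short spoken passage about {theme.lower()}."

# Pick a random theme for the next synthetic TTS request.
tts_text = build_tts_text(random.choice(TTS_THEMES))

# TTS_BLACKLIST_STAKE is the stake below which miners are expected to refuse TTS requests.
print(tts_text, TTS_BLACKLIST_STAKE)
```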
47 changes: 46 additions & 1 deletion cortext/protocol.py
@@ -383,4 +383,49 @@ class TextPrompting(bt.Synapse):
default=True,
title="streaming",
description="whether to stream the output",
)
)


class TTSResponse(bt.Synapse):
    """ A class to represent the TTS request and response. """

    text: str = pydantic.Field(
        title="Text",
        description="The input text to generate audio for."
    )

    provider: Literal["ElevenLabs"] = pydantic.Field(
        default="ElevenLabs",
        title="Provider",
        description="The provider to use when calling for your response."
    )

    model: str = pydantic.Field(
        default="eleven_multilingual_v2",
        title="Model",
        description="The model used for TTS."
    )

    voice: str = pydantic.Field(
        default="Rachel",
        title="Voice",
        description="The voice used for TTS."
    )

    audio_b64: Optional[str] = pydantic.Field(
        None,
        title="Base64-encoded audio",
        description="The resulting audio in base64 encoding corresponding to the input text."
    )

    uid: int = pydantic.Field(
        default=60,
        title="uid",
        description="The UID to send the synapse to",
    )

    timeout: int = pydantic.Field(
        default=60,
        title="timeout",
        description="The timeout for the dendrite of the synapse",
    )
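As a usage illustration, the sketch below shows how a validator might forward this synapse to a miner over a dendrite and decode the returned audio. The wallet, axon, and output handling are placeholders, and the call follows the usual bittensor dendrite pattern rather than code contained in this PR:

```python
import base64
import asyncio

import bittensor as bt
from cortext.protocol import TTSResponse

async def query_tts(wallet, axon, text: str) -> bytes:
    # wallet: a bittensor wallet; axon: a miner axon taken from the metagraph (placeholders here).
    dendrite = bt.dendrite(wallet=wallet)
    synapse = TTSResponse(text=text, provider="ElevenLabs", voice="Rachel")
    # Forward the synapse; the miner is expected to populate audio_b64.
    responses = await dendrite(axons=[axon], synapse=synapse, timeout=synapse.timeout, deserialize=False)
    resp = responses[0]
    if resp.audio_b64 is None:
        raise RuntimeError("miner returned no audio")
    return base64.b64decode(resp.audio_b64)

# Example (placeholders for wallet and axon):
# audio_bytes = asyncio.run(query_tts(my_wallet, miner_axon, "Hello from Cortex.t"))
```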
43 changes: 43 additions & 0 deletions cortext/reward.py
@@ -26,21 +26,26 @@
import torch
import openai
import typing
import string
import difflib
import asyncio
import logging
import aiohttp
import requests
import traceback
import jiwer
import numpy as np
from numpy.linalg import norm
import bittensor as bt
from cortext import utils
from PIL import Image
from scipy.spatial.distance import cosine
import scipy.stats
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import CLIPProcessor, CLIPModel
from speechmos import dnsmos
from faster_whisper import WhisperModel

# ==== TEXT ====

@@ -264,3 +269,41 @@ async def embeddings_score_dot(openai_answer: list, response: list, weight: floa

bt.logging.info(f"Average embeddings cosine similarity does not exceed threshold: {avg_cosine_similarity}")
return 0

# ==== TTS ====
def get_whisper_model(
    model_type: typing.Literal["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"],
    device: str = "cpu",
    compute_type: typing.Literal["float16", "float32", "bfloat16", "int8", "int8_float16"] = "int8",
    max_length: int = 2048,
) -> WhisperModel:
    # Load a faster-whisper model used to transcribe miner audio during scoring.
    model = WhisperModel(model_type, device=device, compute_type=compute_type)
    model.max_length = max_length
    return model

def calculate_odds(mean_observed: float, n: int, mean_y: float, var_y: float) -> float:
    # One-sample z-statistic: distance of the observed mean from the reference mean,
    # measured in units of the standard error of the mean.
    std_y = var_y ** 0.5
    Z = (mean_observed - mean_y) / (std_y / (n ** 0.5))

    # Convert the z-score into a cumulative probability, then express it as odds.
    probability = scipy.stats.norm.cdf(Z)
    odds = probability / (1 - probability)

    return odds

def lower_remove_punctuation(text: str) -> str:
    # Lower-case the text and strip punctuation so WER measures word differences only.
    return text.lower().translate(str.maketrans('', '', string.punctuation)).strip()

def calculate_wer(audio, model: WhisperModel, text: str) -> float:
    # Transcribe the candidate audio with faster-whisper.
    with torch.inference_mode():
        segments, _ = model.transcribe(audio, beam_size=5, max_new_tokens=4000, without_timestamps=True, language=None)
        output = " ".join(x.text for x in segments)

    # Normalise both strings before scoring.
    output = lower_remove_punctuation(output)
    text = lower_remove_punctuation(text)
    # jiwer expects the reference (ground-truth text) first and the hypothesis (transcription) second.
    wer = jiwer.wer(text, output)

    return wer

def dnsmos_score(audio, sr: int) -> float:
    # DNSMOS overall MOS estimate for the generated audio (roughly a 1-5 quality scale).
    return dnsmos.run(audio, sr=sr, return_df=True, verbose=False)['ovrl_mos']
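To show how these helpers could combine into a score, here is a hedged end-to-end sketch: decode the miner's base64 audio, check intelligibility against the requested text via WER, then gate perceptual quality with DNSMOS. Reading the payload with soundfile, the 0.5 WER cut-off, and dividing the MOS by 5 are illustrative assumptions, not values taken from this change:

```python
import io
import base64

import soundfile as sf  # assumed dependency for decoding the audio payload

from cortext.reward import get_whisper_model, calculate_wer, dnsmos_score

def score_tts_response(audio_b64: str, requested_text: str) -> float:
    # Decode the base64 payload; assumes a WAV/FLAC container that soundfile can read.
    audio, sr = sf.read(io.BytesIO(base64.b64decode(audio_b64)), dtype="float32")

    # Transcribe and compare against the requested text.
    # Note: faster-whisper expects 16 kHz mono input, so resampling may be needed first,
    # and in practice the model would be loaded once and cached rather than per call.
    whisper = get_whisper_model("base", device="cpu", compute_type="int8")
    wer = calculate_wer(audio, whisper, requested_text)
    if wer > 0.5:  # illustrative cut-off: unintelligible audio scores zero
        return 0.0

    # Perceptual quality, roughly on a 1-5 MOS scale, normalised into [0, 1].
    mos = float(dnsmos_score(audio, sr))
    return max(0.0, min(1.0, mos / 5.0))
```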