diff --git a/.gitignore b/.gitignore
index 08519c7..dc79d9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,10 @@ models
 generated
 .idea
 .DS_Store
+milvus/seed/data/*
+milvus/build/volumes/milvus/*data*
+*.venv
+*venv
 
 # UI assets
 **/node_modules
diff --git a/milvus/build/Containerfile b/milvus/build/Containerfile
new file mode 100644
index 0000000..779a32b
--- /dev/null
+++ b/milvus/build/Containerfile
@@ -0,0 +1,2 @@
+FROM docker.io/milvusdb/milvus:master-20240426-bed6363f
+ADD embedEtcd.yaml /milvus/configs/embedEtcd.yaml
diff --git a/milvus/build/Makefile b/milvus/build/Makefile
new file mode 100644
index 0000000..cdf6bea
--- /dev/null
+++ b/milvus/build/Makefile
@@ -0,0 +1,55 @@
+REGISTRY ?= quay.io
+REGISTRY_ORG ?= ai-lab
+COMPONENT = vector_dbs
+
+IMAGE ?= $(REGISTRY)/$(REGISTRY_ORG)/$(COMPONENT)/milvus:latest
+
+ARCH ?= $(shell uname -m)
+PLATFORM ?= linux/$(ARCH)
+
+gRPC_PORT := 19530
+REST_PORT := 9091
+CLIENT_PORT := 2379
+
+LIB_MILVUS_DIR_MOUNTPATH := $(shell pwd)/volumes/milvus
+
+.PHONY: build
+build:
+	podman build --platform $(PLATFORM) -f Containerfile -t ${IMAGE} .
+
+.PHONY: run
+run:
+	podman run -d \
+	--name milvus-standalone \
+	--security-opt seccomp:unconfined \
+	-e ETCD_USE_EMBED=true \
+	-e ETCD_CONFIG_PATH=/milvus/configs/embedEtcd.yaml \
+	-e COMMON_STORAGETYPE=local \
+	-v $(LIB_MILVUS_DIR_MOUNTPATH):/var/lib/milvus \
+	-p $(gRPC_PORT):$(gRPC_PORT) \
+	-p $(REST_PORT):$(REST_PORT) \
+	-p $(CLIENT_PORT):$(CLIENT_PORT) \
+	--health-cmd="curl -f http://localhost:$(REST_PORT)/healthz" \
+	--health-interval=30s \
+	--health-start-period=90s \
+	--health-timeout=20s \
+	--health-retries=3 \
+	$(IMAGE) \
+	milvus run standalone 1> /dev/null
+
+.PHONY: stop
+stop:
+	-podman stop milvus-standalone
+
+.PHONY: delete
+delete:
+	-podman rm milvus-standalone -f
+
+.PHONY: podman-clean
+podman-clean:
+	@container_ids=$$(podman ps -a --format "{{.ID}} {{.Image}}" | awk '$$2 == "$(IMAGE)" {print $$1}'); \
+	echo "removing all containers with IMAGE=$(IMAGE)"; \
+	for id in $$container_ids; do \
+		echo "Removing container: $$id"; \
+		podman rm -f $$id; \
+	done
diff --git a/milvus/build/embedEtcd.yaml b/milvus/build/embedEtcd.yaml
new file mode 100644
index 0000000..32954fa
--- /dev/null
+++ b/milvus/build/embedEtcd.yaml
@@ -0,0 +1,5 @@
+listen-client-urls: http://0.0.0.0:2379
+advertise-client-urls: http://0.0.0.0:2379
+quota-backend-bytes: 4294967296
+auto-compaction-mode: revision
+auto-compaction-retention: '1000'
diff --git a/milvus/build/volumes/Containerfile b/milvus/build/volumes/Containerfile
new file mode 100644
index 0000000..779a32b
--- /dev/null
+++ b/milvus/build/volumes/Containerfile
@@ -0,0 +1,2 @@
+FROM docker.io/milvusdb/milvus:master-20240426-bed6363f
+ADD embedEtcd.yaml /milvus/configs/embedEtcd.yaml
diff --git a/milvus/build/volumes/Makefile b/milvus/build/volumes/Makefile
new file mode 100644
index 0000000..1113215
--- /dev/null
+++ b/milvus/build/volumes/Makefile
@@ -0,0 +1,55 @@
+REGISTRY ?= quay.io
+REGISTRY_ORG ?= ai-lab
+COMPONENT = vector_dbs
+
+IMAGE ?= $(REGISTRY)/$(REGISTRY_ORG)/$(COMPONENT)/milvus:latest
+
+ARCH ?= $(shell uname -m)
+PLATFORM ?= linux/$(ARCH)
+
+gRPC_PORT := 19530
+REST_PORT := 9091
+CLIENT_PORT := 2379
+
+LIB_MILVUS_DIR_MOUNTPATH := $(shell pwd)/volumes/milvus
+
+.PHONY: build
+build:
+	podman build --platform $(PLATFORM) -f Containerfile -t ${IMAGE} .
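+# Example local workflow (a sketch; assumes podman is installed and the default
+# REGISTRY/REGISTRY_ORG values are acceptable -- override them as needed):
+#   make build                                  # builds $(IMAGE) for $(PLATFORM)
+#   make run                                    # starts the milvus-standalone container from the run target below
+#   podman healthcheck run milvus-standalone    # optional: confirm the configured health check passes
+#   make stop && make delete                    # tear the container back down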
+
+.PHONY: run
+run:
+	podman run -it \
+	--name milvus-standalone \
+	--security-opt seccomp:unconfined \
+	-e ETCD_USE_EMBED=true \
+	-e ETCD_CONFIG_PATH=/milvus/configs/embedEtcd.yaml \
+	-e COMMON_STORAGETYPE=local \
+	-v $(LIB_MILVUS_DIR_MOUNTPATH):/var/lib/milvus \
+	-p $(gRPC_PORT):$(gRPC_PORT) \
+	-p $(REST_PORT):$(REST_PORT) \
+	-p $(CLIENT_PORT):$(CLIENT_PORT) \
+	--health-cmd="curl -f http://localhost:$(REST_PORT)/healthz" \
+	--health-interval=30s \
+	--health-start-period=90s \
+	--health-timeout=20s \
+	--health-retries=3 \
+	$(IMAGE) \
+	milvus run standalone 1> /dev/null
+
+.PHONY: stop
+stop:
+	-podman stop milvus-standalone
+
+.PHONY: delete
+delete:
+	-podman rm milvus-standalone -f
+
+.PHONY: podman-clean
+podman-clean:
+	@container_ids=$$(podman ps --format "{{.ID}} {{.Image}}" | awk '$$2 == "$(IMAGE)" {print $$1}'); \
+	echo "removing all containers with IMAGE=$(IMAGE)"; \
+	for id in $$container_ids; do \
+		echo "Removing container: $$id"; \
+		podman rm -f $$id; \
+	done
diff --git a/milvus/build/volumes/embedEtcd.yaml b/milvus/build/volumes/embedEtcd.yaml
new file mode 100644
index 0000000..32954fa
--- /dev/null
+++ b/milvus/build/volumes/embedEtcd.yaml
@@ -0,0 +1,5 @@
+listen-client-urls: http://0.0.0.0:2379
+advertise-client-urls: http://0.0.0.0:2379
+quota-backend-bytes: 4294967296
+auto-compaction-mode: revision
+auto-compaction-retention: '1000'
diff --git a/milvus/build/volumes/milvus/.gitkeep b/milvus/build/volumes/milvus/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/milvus/seed/.env.example b/milvus/seed/.env.example
new file mode 100644
index 0000000..798bc46
--- /dev/null
+++ b/milvus/seed/.env.example
@@ -0,0 +1,3 @@
+MODEL_NAME=
+MODEL_ENDPOINT=
+MODEL_TOKEN=
\ No newline at end of file
diff --git a/milvus/seed/README.md b/milvus/seed/README.md
new file mode 100644
index 0000000..5df33a2
--- /dev/null
+++ b/milvus/seed/README.md
@@ -0,0 +1,29 @@
+RAG application with ILAB
+
+1. Set up a vector DB (Milvus)
+
+Development story:
+  0. Starting goal:
+    - Naive RAG, not KG-aided
+    - Additions:
+      1. Identify what the model lacks knowledge in
+      2. Can I use the internally trained model or do I have to use the HF model?
+      -
+
+- UI integration
+
+-----------------------------------------------
+
+variable definition
+class Config
+
+_identify_params,
+_llm_type, _extract_token_usage,
+
+Inherent in defining this spec, which could eventually live as a contribution to langchain, are some assumptions / questions I made:
+  - Is the model serializable: Assumed no
+  - Max tokens for merlinite and granite: Both assumed 4096
+  - Does this model have attention / memory?
+  - Do these models have a verbosity option for output?
+  - Recommended default values:
+    -
\ No newline at end of file
diff --git a/milvus/seed/__pycache__/ilab_model.cpython-311.pyc b/milvus/seed/__pycache__/ilab_model.cpython-311.pyc
new file mode 100644
index 0000000..2b8da03
Binary files /dev/null and b/milvus/seed/__pycache__/ilab_model.cpython-311.pyc differ
diff --git a/milvus/seed/__pycache__/merlinite_model.cpython-311.pyc b/milvus/seed/__pycache__/merlinite_model.cpython-311.pyc
new file mode 100644
index 0000000..19d0734
Binary files /dev/null and b/milvus/seed/__pycache__/merlinite_model.cpython-311.pyc differ
diff --git a/milvus/seed/client.py b/milvus/seed/client.py
new file mode 100644
index 0000000..53d1e4e
--- /dev/null
+++ b/milvus/seed/client.py
@@ -0,0 +1,66 @@
+import requests
+import json
+import os
+from ilab_model import IlabLLM
+from dotenv import load_dotenv
+from langchain_core.prompts import PromptTemplate
+from langchain.chains import LLMChain
+
+load_dotenv()
+
+# manage ENV
+# os.getenv returns None when a variable is unset, so fall back on any empty value
+model_endpoint = os.getenv('MODEL_ENDPOINT')
+if not model_endpoint:
+    model_endpoint = "http://localhost:8001"
+
+model_name = os.getenv('MODEL_NAME')
+if not model_name:
+    model_name = "ibm/merlinite-7b"
+
+model_token = os.getenv('ILAB_API_TOKEN')
+
+# HTTPS client
+# client_key_path = "/home/fedora/client-tls-key.pem2"
+# client_crt_path = "/home/fedora/client-tls-crt.pem2"
+# server_ca_crt = "/home/fedora/server-ca-crt.pem2"
+
+# ssl_context = ssl.create_default_context(cafile=server_ca_crt)
+# ssl_context.load_cert_chain(certfile=client_crt_path, keyfile=client_key_path)
+
+# client = httpx.Client(verify=ssl_context)
+
+# data = {
+#     "model": "instructlab/granite-7b-lab",
+#     "messages": [
+#         {"role": "system", "content": "your name is carl"},
+#         {"role": "user", "content": "what is your name?"}
+#     ],
+#     "temperature": 1,
+#     "max_tokens": 1792,
+#     "top_p": 1,
+#     "repetition_penalty": 1.05,
+#     "stop": ["<|endoftext|>"],
+#     "logprobs": False,
+#     "stream": False
+# }
+
+# response = requests.post(url, headers=headers, data=json.dumps(data), verify=False)
+# print(response.json())
+
+print(f'model_name={model_name}')
+llm = IlabLLM(
+    model_endpoint=model_endpoint,
+    model_name=model_name,
+    apikey=model_token,
+    temperature=1,
+    max_tokens=500,
+    top_p=1,
+    repetition_penalty=1.05,
+    stop=["<|endoftext|>"],
+    streaming=False
+)
+
+prompt = "I am training for a marathon in 12 weeks. Can you help me build an exercise plan to help prepare myself?"
+prompts = [prompt]
+# prompt_template = PromptTemplate.from_template(prompt)
+llm.generate(prompts)
+# llm.invoke("dog")
diff --git a/milvus/seed/dumb_client.py b/milvus/seed/dumb_client.py
new file mode 100644
index 0000000..e08c912
--- /dev/null
+++ b/milvus/seed/dumb_client.py
@@ -0,0 +1,40 @@
+import requests
+import json
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# manage ENV
+# os.getenv returns None when a variable is unset, so fall back on any empty value
+model_endpoint = os.getenv('MODEL_ENDPOINT')
+if not model_endpoint:
+    model_endpoint = "http://localhost:8001"
+
+model_name = os.getenv('MODEL_NAME')
+if not model_name:
+    model_name = "ibm/merlinite-7b"
+
+model_token = os.getenv('MODEL_TOKEN')
+
+headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {model_token}"
+}
+
+data = {
+    "model": model_name,
+    "messages": [
+        {"role": "system", "content": "your name is carl"},
+        {"role": "user", "content": "what is your name?"}
+    ],
+    "temperature": 1,
+    "max_tokens": 1792,
+    "top_p": 1,
+    "repetition_penalty": 1.05,
+    "stop": ["<|endoftext|>"],
+    "logprobs": False,
+    "stream": False
+}
+
+response = requests.post(model_endpoint, headers=headers, data=json.dumps(data), verify=False)
+print(response.json())
\ No newline at end of file
diff --git a/milvus/seed/ilab_model.py b/milvus/seed/ilab_model.py
new file mode 100644
index 0000000..bc0b009
--- /dev/null
+++ b/milvus/seed/ilab_model.py
@@ -0,0 +1,372 @@
+#!/bin/python3
+
+## This is a langchain-compatible implementation for the Ilab models. It will remain in this repo until we publish API-key
+## functionality and route backend service endpoints through a proxy that can be exposed, similarly to OpenAI. At that point
+## we can move this PR as a contribution to langchain and easily scale our usage!
+
+### Fixes in progress:
+    ### - override self params with calls to invoke or generate for temperature, etc.
+    ### - test that invoke works, generate starts
+    ### - Feat: streaming implementation
+    ### - Callbacks with streaming
+    ### - Authentication enablement via user and password rather than just API keys
+    ### - Authentication checking for API keys (whole backend API setup)
+    ### - Utilize tags and metadata with langserve
+    ### - Allow logprobs as an option
+
+import os
+import httpx
+import requests
+import json
+from langchain_core.language_models.llms import BaseLLM
+from dotenv import load_dotenv
+from langchain_core.outputs import Generation, LLMResult
+from langchain_core.pydantic_v1 import Field, SecretStr, root_validator
+from langchain_core.utils import (
+    convert_to_secret_str,
+    get_from_dict_or_env,
+    get_pydantic_field_names,
+)
+from langchain_core.utils.utils import build_extra_kwargs
+
+load_dotenv()
+from typing import (
+    Any,
+    Dict,
+    List,
+    Set,
+    Optional,
+    Mapping
+)
+
+class IlabLLM(BaseLLM):
+    """
+    Instructlab large language model.
+
+    As this model is currently private, you must have pre-arranged access.
+    """
+
+    # REQUIRED PARAMS
+
+    model_endpoint: str = ""
+    """The model endpoint to use."""
+
+    model_name: str = Field(alias="model")
+    """Type of deployed model to use."""
+
+    # OPTIONAL BUT DEFAULTS
+
+    system_prompt: Optional[str] = "You are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior."
+ """Default system prompt to use.""" + + model_kwargs: Dict[str, Any] = Field(default_factory=dict) + """Holds any model parameters valid for `create` call not explicitly specified.""" + + max_tokens: int = 4096 + """The maximum number of tokens to generate in the completion. + -1 returns as many tokens as possible given the prompt and + the models maximal context size.""" + + # TOTALLY OPTIONAL + + apikey: Optional[SecretStr] = None + """Apikey to the Ilab model APIs (merlinte or granite)""" + + top_p: Optional[float] = 1 + """Total probability mass of tokens to consider at each step.""" + + frequency_penalty: Optional[float] = 0 + """Penalizes repeated tokens according to frequency.""" + + repetition_penalty: Optional[float] = 0 + """Penalizes repeated tokens.""" + + temperature: Optional[float] = 0.7 + """What sampling temperature to use.""" + + # verbose: Optional[str] = None + # """If the model should return verbose output or standard""" + + streaming: bool = False + """ Whether to stream the results or not. """ + + # FUTURE EXTENSIONS + + tags: Optional[List[str]] = None + """Tags to add to the run trace.""" + + metadata: Optional[Dict[str, Any]] = None + """Metadata to add to the run trace.""" + + # This gets implemented with stream + # callbacks: Optional[SecretStr] = None + # """callbacks""" + + # END PARMS + + class Config: + """Configuration for this pydantic object.""" + allow_population_by_field_name = True + + @property + def lc_secrets(self) -> Dict[str, str]: + """A map of constructor argument names to secret ids. + + For example: + { + "apikey": "ILAB_API_KEY", + } + """ + return { + "apikey": "ILAB_API_KEY", + } + + @classmethod + def is_lc_serializable(cls) -> bool: + """Return whether this model can be serialized by Langchain.""" + return False + + @root_validator(pre=True) + def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]: + """Build extra kwargs from additional params that were passed in.""" + all_required_field_names = get_pydantic_field_names(cls) + extra = values.get("model_kwargs", {}) + values["model_kwargs"] = build_extra_kwargs( + extra, values, all_required_field_names + ) + return values + + @root_validator() + def validate_environment(cls, values: Dict) -> Dict: + if values["streaming"] == True: + raise ValueError("streaming has not yet been implemented.") + if values["apikey"] or "ILAB_API_KEY" in os.environ: + values["apikey"] = convert_to_secret_str( + get_from_dict_or_env(values, "apikey", "ILAB_API_KEY") + ) + values['model_name'] = get_from_dict_or_env( + values, + "model_name", + "MODEL_NAME", + ) + ## extension for more options for required auth params + ## client_params = { + ## "api_key": ( + ## values["apikey"].get_secret_value() + ## if values["apikey"] + ## else None + ## ) + ## } + # CURRENTLY WE DONT CHECK KEYS + ## if not client_params['values']['apikey']: + ## raise ValueError("Did not find token `apikey`.") + return values + + @property + def _params(self) -> Mapping[str, Any]: + """Get the identifying parameters.""" + params = {**{ + "model_name": self.model_name, + "model_endpoint": self.model_endpoint, + }, **self._default_params} + if self.apikey: + params['apikey'] = self.apikey + if self.model_name: + params['model_name'] = self.model_name + return params + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters for calling Merlinite API.""" + normal_params: Dict[str, Any] = { + "temperature": self.temperature, + "top_p": self.top_p, + "frequency_penalty": self.frequency_penalty, + 
"presence_penalty": self.repetition_penalty, + } + + if self.max_tokens is not None: + normal_params["max_tokens"] = self.max_tokens + + return {**normal_params, **self.model_kwargs} + + + def _invocation_params(self) -> Dict[str, Any]: + """Get the parameters used to invoke the model.""" + return self._params + + def make_request(self, params: Dict[str, Any], prompt: str, stop: Optional[List[str]]) -> Dict[str, Any]: + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.apikey}" + } + + data = { + "model": params['model_name'], + "messages": [ + { + "role": "system", + "content": self.system_prompt + }, + { + "role": "user", + "content": prompt + } + ], + "temperature": params['temperature'], + "max_tokens": params['max_tokens'], + "top_p": params['top_p'], + "stop": stop, + "logprobs": False, + } + + if 'repetition_penalty' in params: + data["repetition_penalty"] = params['repetition_penalty'] + + if 'streaming' in params: + # Shadowing basemodel re-route for streaming + data["stream"] = params["streaming"] + + response = requests.post(self.model_endpoint, headers=headers, data=json.dumps(data), verify=False) + response_json = response.json() + + def _call(self, prompt: str, stop:Optional[List[str]] = None, **kwargs: Any) -> str: + """Call the ilab inference endpoint. The result of invoke. + Args: + prompt: The prompt to pass into the model. + stop: Optional list of stop words to use when generating. + run_manager: Optional callback manager. + Returns: + The string generated by the model. + Example: + .. code-block:: python + + response = merlinite.invoke("What is a molecule") + """ + + invocation_params = self._invocation_params + params = {**invocation_params, **kwargs} + + if stop == None: + stop = ["<|endoftext|>"] + response_json = self.make_request( + params=params, prompt=prompt, stop=stop, **kwargs + ) + return response_json['choices'][0]['messages']['content'] + + def _generate( + self, + prompts: List[str], + stop: Optional[List[str]] = None, + **kwargs: Any, + ) -> LLMResult: + """Call out to Ilab's endpoint with prompt. + + Args: + prompt: The prompt to pass into the model. + stop: Optional list of stop words to use when generating. + + Returns: + The full LLM output. + + Example: + .. code-block:: python + + response = ilab.generate(["Tell me a joke."]) + """ + + invocation_params = self._invocation_params() + params = {**invocation_params, **kwargs} + token_usage: Dict[str, int] = {} + system_fingerprint: Optional[str] = None + + response_json = self.make_request( + params=params, prompt=prompts[0], stop=stop, **kwargs + ) + + if not ('choices' in response_json and len(response_json['choices']) > 0): + raise ValueError("No valid response from the model") + + if response_json.get("error"): + raise ValueError(response_json.get("error")) + + if not system_fingerprint: + system_fingerprint = response_json.get("system_fingerprint") + return self._create_llm_result( + response_json=response_json, + ) + + def _llm_type(self) -> str: + """Get the type of language model used by this chat model. 
+        Used for logging purposes only."""
+        return "instructlab"
+
+    @property
+    def max_context_size(self) -> int:
+        """Get max context size for this model."""
+        return self.modelname_to_contextsize(self.model_name)
+
+    def _create_llm_result(self, response: List[dict]) -> LLMResult:
+        """Create the LLMResult from the choices and prompt."""
+        generations = []
+        for res in response:
+            results = res.get("results")
+            if results:
+                finish_reason = results[0].get("choices")[0].get('finished_reason')
+                gen = Generation(
+                    text=results[0].get("choices")[0].get('message').get('content'),
+                    generation_info={"finish_reason": finish_reason},
+                )
+                generations.append([gen])
+        final_token_usage = self._extract_token_usage(response)
+        llm_output = {
+            "token_usage": final_token_usage,
+            "model_name": self.model_name
+        }
+        return LLMResult(generations=generations, llm_output=llm_output)
+
+    @staticmethod
+    def _extract_token_usage(
+        response: Optional[List[Dict[str, Any]]] = None,
+    ) -> Dict[str, Any]:
+        if response is None:
+            return {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
+
+        prompt_tokens = 0
+        completion_tokens = 0
+        total_tokens = 0
+
+        def get_count_value(key: str, result: Dict[str, Any]) -> int:
+            return result.get(key, 0) or 0
+
+        for res in response:
+            results = res.get("results")
+            if results:
+                prompt_tokens += get_count_value("prompt_tokens", results[0])
+                completion_tokens += get_count_value(
+                    "completion_tokens", results[0]
+                )
+                total_tokens += get_count_value("total_tokens", results[0])
+
+        return {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens
+        }
+
+    @staticmethod
+    def modelname_to_contextsize(modelname: str) -> int:
+        """Calculate the maximum number of tokens possible to generate for a model."""
+        model_token_mapping = {
+            "ibm/merlinite-7b": 4096,
+            "instructlab/granite-7b-lab": 4096
+        }
+
+        context_size = model_token_mapping.get(modelname, None)
+
+        if context_size is None:
+            raise ValueError(
+                f"Unknown model: {modelname}. Please provide a valid Ilab model name. "
+                "Known models are: " + ", ".join(model_token_mapping.keys())
+            )
+
+        return context_size
diff --git a/milvus/seed/new_seed.py b/milvus/seed/new_seed.py
new file mode 100644
index 0000000..60311c7
--- /dev/null
+++ b/milvus/seed/new_seed.py
@@ -0,0 +1,41 @@
+import os
+from pymilvus import MilvusClient, DataType
+from langchain_community.vectorstores import Milvus
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
+from langchain import hub
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.output_parsers import StrOutputParser
+from tika import parser  # pip install tika
+from langchain_openai import OpenAI
+from ilab_models import IlabOpenAILLM
+
+
+def log_step(step_num, step_name) -> None:
+    print("-----------------------------------------------")
+    print(f"{step_num}. {step_name}")
{step_name}") + print("-----------------------------------------------") + +embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") + +text_splitter = SemanticChunker(embeddings=embeddings) # fails + +loader = PyPDFLoader('./data/DnD-5e-Handbook.pdf') +data = loader.load() +split_data = text_splitter.split_documents(data) +print(len(split_data)) +vector_store = Milvus.from_documents( + documents=split_data, + embedding=embeddings, + connection_args={"host": "localhost", "port": 19530}, + collection_name="dnd" +) + +llm = IlabOpenAILLM( + +) + +retreiver = vector_store.as_retreiver() +prompt = hub.pull("rlm/rag-prompt") \ No newline at end of file diff --git a/milvus/seed/requirements-lock.txt b/milvus/seed/requirements-lock.txt new file mode 100644 index 0000000..0d2c865 --- /dev/null +++ b/milvus/seed/requirements-lock.txt @@ -0,0 +1,78 @@ +aiohttp==3.9.5 +aiosignal==1.3.1 +annotated-types==0.7.0 +anyio==4.3.0 +attrs==23.2.0 +beautifulsoup4==4.12.3 +bs4==0.0.2 +certifi==2024.2.2 +charset-normalizer==3.3.2 +dataclasses-json==0.6.6 +distro==1.9.0 +environs==9.5.0 +filelock==3.14.0 +frozenlist==1.4.1 +fsspec==2024.5.0 +grpcio==1.63.0 +h11==0.14.0 +httpcore==1.0.5 +httpx==0.27.0 +huggingface-hub==0.23.1 +idna==3.7 +Jinja2==3.1.4 +joblib==1.4.2 +jsonpatch==1.33 +jsonpointer==2.4 +langchain==0.2.1 +langchain-community==0.2.1 +langchain-core==0.2.1 +langchain-experimental==0.0.59 +langchain-openai==0.1.7 +langchain-text-splitters==0.2.0 +langsmith==0.1.63 +MarkupSafe==2.1.5 +marshmallow==3.21.2 +milvus-lite==2.4.5 +mpmath==1.3.0 +multidict==6.0.5 +mypy-extensions==1.0.0 +networkx==3.3 +numpy==1.26.4 +openai==1.30.3 +orjson==3.10.3 +packaging==23.2 +pandas==2.2.2 +pillow==10.3.0 +protobuf==5.27.0 +pydantic==2.7.1 +pydantic_core==2.18.2 +pymilvus==2.4.3 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +pytz==2024.1 +PyYAML==6.0.1 +regex==2024.5.15 +requests==2.32.2 +safetensors==0.4.3 +scikit-learn==1.5.0 +scipy==1.13.1 +sentence-transformers==2.7.0 +six==1.16.0 +sniffio==1.3.1 +soupsieve==2.5 +SQLAlchemy==2.0.30 +sympy==1.12 +tenacity==8.3.0 +threadpoolctl==3.5.0 +tika==2.6.0 +tiktoken==0.7.0 +tokenizers==0.19.1 +torch==2.3.0 +tqdm==4.66.4 +transformers==4.41.1 +typing-inspect==0.9.0 +typing_extensions==4.12.0 +tzdata==2024.1 +ujson==5.10.0 +urllib3==2.2.1 +yarl==1.9.4 diff --git a/milvus/seed/requirements.txt b/milvus/seed/requirements.txt new file mode 100644 index 0000000..431c4f8 --- /dev/null +++ b/milvus/seed/requirements.txt @@ -0,0 +1,10 @@ +pymilvus==2.4.3 +langchain==0.2.1 +langchain-community==0.2.1 +langchain-core==0.2.1 +langchain-openai==0.1.7 +langchain-experimental==0.0.59 +tika==2.6.0 +sentence-transformers==2.7.0 +beautifulsoup4==4.12.3 +python-dotenv==1.0.1 diff --git a/milvus/seed/seed.py b/milvus/seed/seed.py new file mode 100644 index 0000000..044158e --- /dev/null +++ b/milvus/seed/seed.py @@ -0,0 +1,83 @@ +import os +from pymilvus import MilvusClient, DataType +from langchain_community.vectorstores import Milvus +from langchain_experimental.text_splitter import SemanticChunker +from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader +from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings +from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter +from langchain import hub +from langchain_core.runnables import RunnablePassthrough +from langchain_core.output_parsers import StrOutputParser +from tika import parser # pip install tika + + +def log_step(step_num, 
+    print("-----------------------------------------------")
+    print(f"{step_num}. {step_name}")
+    print("-----------------------------------------------")
+
+def milvus_init() -> MilvusClient:
+    client = MilvusClient()
+    # drop any existing 'dnd' collection so reseeding starts from a clean slate
+    if client.has_collection('dnd'):
+        client.drop_collection('dnd')
+    return client
+
+def fill_dnd_collection(text_splitter: any, embeddings: any) -> None:
+    # local
+    # raw = parser.from_file("data/DnD-5e-Handbook.pdf")
+    # print(len(raw['content']))
+    # docs = text_splitter.create_documents([raw['content']])
+    # vector_store = Milvus.from_documents(
+    #     docs,
+    #     embedding=embeddings,
+    #     connection_args={"host": "localhost", "port": 19530},
+    #     collection_name="dnd"
+    # )
+    # remote
+    loader = PyPDFLoader('https://orkerhulen.dk/onewebmedia/DnD%205e%20Players%20Handbook%20%28BnW%20OCR%29.pdf')
+    data = loader.load()
+    split_data = text_splitter.split_documents(data)
+    vector_store = Milvus.from_documents(
+        documents=split_data,
+        embedding=embeddings,
+        connection_args={"host": "localhost", "port": 19530},
+        collection_name="dnd"
+    )
+
+def generate_embeddings() -> any:
+    # model_name = "ibm/merlinite-7b"
+    # model_kwargs={"device": "cuda"},
+    # encode_kwargs = {"device": "cuda", "batch_size": 100, "normalize_embeddings": True}
+    model_name = "all-MiniLM-L6-v2"
+    model_kwargs = {"device": "cpu"}
+    encode_kwargs = {"normalize_embeddings": True}
+    embeddings = HuggingFaceBgeEmbeddings(
+        model_name=model_name,
+        # model_kwargs=model_kwargs,
+        encode_kwargs=encode_kwargs,
+        query_instruction="search_query:",
+        embed_instruction="search_document:"
+    )
+    return embeddings
+
+def generate_text_splitter(chunk_size=512, chunk_overlap=50) -> any:
+    # text_splitter = SemanticChunker(embeddings=embeddings) # fails
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len,
+        is_separator_regex=False
+    )
+    return text_splitter
+
+log_step(0, "Generate embeddings")
+embeddings = generate_embeddings()
+log_step(1, "Init text splitter")
+text_splitter = generate_text_splitter()
+log_step(2, "Read Raw data from PDF")
+log_step(3, "Text splitting")
+log_step(4, "Log result")
+fill_dnd_collection(embeddings=embeddings, text_splitter=text_splitter)
+
+
+# retriever = vector_store.as_retriever()
+# prompt = hub.pull("rlm/rag-prompt")
\ No newline at end of file
diff --git a/ui/compose.ui b/ui/compose.ui
index 1a1c761..6428e11 100644
--- a/ui/compose.ui
+++ b/ui/compose.ui
@@ -81,3 +81,27 @@ services:
       devices:
         - driver: nvidia
           capabilities: [gpu]
+
+  milvus:
+    container_name: milvus-standalone
+    image: quay.io/ai-lab/vector_dbs/milvus@sha256:7d0be442cbcafeebdb056c56b16f7f7fe96b235d7a82cfdd57edc302042d00eb
+    security_opt:
+      - seccomp:unconfined
+    environment:
+      ETCD_USE_EMBED: "true"
+      ETCD_CONFIG_PATH: "/milvus/configs/embedEtcd.yaml"
+      COMMON_STORAGETYPE: "local"
+    volumes:
+      - /home/fedora/milvus-volume:/var/lib/milvus
+      - /home/fedora/instruct-lab-bot/milvus/seed:/data/milvus/seed
+    ports:
+      - 19530:19530
+      - 9091:9091
+      - 2379:2379
+    healthcheck:
+      test: ["CMD-SHELL", "curl -f http://localhost:9091/healthz || exit 1"]
+      interval: 30s
+      timeout: 20s
+      retries: 3
+      start_period: 90s
+    command: milvus run standalone
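-----------------------------------------------

Usage sketch for the naive RAG query step that the commented-out retriever / rag-prompt lines in seed.py hint at. This is only a sketch, not part of the diff: it assumes seed.py has already populated the "dnd" collection in a local Milvus on port 19530, that the MODEL_* variables from .env.example are set, and the file name (rag_query.py), k value, and sample question are illustrative.

# rag_query.py -- minimal naive RAG query against the seeded "dnd" collection.
import os

from dotenv import load_dotenv
from langchain import hub
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Milvus
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from ilab_model import IlabLLM

load_dotenv()

# Reuse the same embedding model as seed.py so query vectors live in the same
# space as the stored document vectors.
embeddings = HuggingFaceBgeEmbeddings(
    model_name="all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings": True},
    query_instruction="search_query:",
    embed_instruction="search_document:",
)

# Connect to the collection that seed.py filled, rather than re-ingesting.
vector_store = Milvus(
    embedding_function=embeddings,
    connection_args={"host": "localhost", "port": 19530},
    collection_name="dnd",
)
retriever = vector_store.as_retriever(search_kwargs={"k": 4})

llm = IlabLLM(
    model_endpoint=os.getenv("MODEL_ENDPOINT", "http://localhost:8001"),
    model_name=os.getenv("MODEL_NAME", "ibm/merlinite-7b"),
    apikey=os.getenv("MODEL_TOKEN"),
    temperature=0.7,
    max_tokens=500,
)

# Standard "stuff the retrieved chunks into the prompt" chain.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print(chain.invoke("How does initiative work in D&D 5e?"))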