This repository was archived by the owner on Oct 25, 2024. It is now read-only.
[NeuralChat] RAG evaluation #1333
Open: Liangyx2 wants to merge 158 commits into main from yuxiang/evaluation
Commits (158):
- f820019 add retrieval dataset construction codes (Liangyx2)
- 06f8162 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 5ef0332 Update llm_generate_raw_data.py (Liangyx2)
- ee1db83 Delete intel_extension_for_transformers/neural_chat/tools/evaluation/… (Liangyx2)
- 89597f2 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- b132d66 Delete intel_extension_for_transformers/neural_chat/tools/evaluation/… (Liangyx2)
- 8e955ce update (Liangyx2)
- 635b906 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- d7d3d03 Delete intel_extension_for_transformers/neural_chat/tools/evaluation/… (Liangyx2)
- c9fec02 Delete intel_extension_for_transformers/neural_chat/tools/evaluation/… (Liangyx2)
- 5e32113 Delete intel_extension_for_transformers/neural_chat/tools/evaluation/… (Liangyx2)
- f67622c Delete intel_extension_for_transformers/neural_chat/tools/evaluation/… (Liangyx2)
- f2e344a Delete intel_extension_for_transformers/neural_chat/tools/evaluation/… (Liangyx2)
- 383e5b3 Update prompt.py (Liangyx2)
- 81014d1 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 4b7bec7 Update llm_generate_raw_data.py (Liangyx2)
- 0df51a6 Update llm_generate_raw_data.py (Liangyx2)
- 95b16bd Update retrieval_dataset_construction.py (Liangyx2)
- 80dd21b Update llm_generate_raw_data.py (Liangyx2)
- f495b22 Update mine_hard_negatives_check_similarity.py (Liangyx2)
- 593dee3 add test_evaluation.py to nightly test (Liangyx2)
- cf59b18 Update and rename requirements.txt to requirements_cpu.txt (Liangyx2)
- 40e0b0e Create requirements_cuda.txt (Liangyx2)
- bf1b1aa Update requirements.txt (Liangyx2)
- 5552ebc Update retrieval_dataset_construction.py (Liangyx2)
- d3b7579 Update llm_generate_raw_data.py (Liangyx2)
- f500b2b Update retrieval_dataset_construction.py (Liangyx2)
- b65c4bf Update llm_generate_raw_data.py (Liangyx2)
- c43ab73 Update test_evaluation.py (Liangyx2)
- feda3c0 Update retrieval_dataset_construction.py (Liangyx2)
- 1c2c22c Update mine_hard_negatives_check_similarity.py (Liangyx2)
- 55a5cda add README.md (Liangyx2)
- 7a74f86 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 39754d0 Update README.md (Liangyx2)
- d7e95f0 add evaluate_retrieval.py (Liangyx2)
- 186ab43 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 1496219 Update test_evaluation.py (Liangyx2)
- 03a768e [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 128d587 Update test_evaluation.py (Liangyx2)
- 25177bd Merge branch 'main' into yuxiang/evaluation (XuehaoSun)
- 705752a add README.md (Liangyx2)
- 675fe2e Update prompt.py (Liangyx2)
- 988e542 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- d0c3c34 add llm_generate_truth.py and data (Liangyx2)
- be1106b [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 48788d4 add ragas_evaluation.py (Liangyx2)
- 54cc6c0 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- e1b5585 Create requirements.txt (Liangyx2)
- 88a4293 Update llm_generate_truth.py (Liangyx2)
- 83060f9 Update evaluate_retrieval.py (Liangyx2)
- 76b1175 Update ragas_evaluation.py (Liangyx2)
- b775095 Update test_evaluation.py (Liangyx2)
- edbb32c Update llm_generate_truth.py (Liangyx2)
- 8962abf Update README.md (Liangyx2)
- 2ef4e05 Update README.md (Liangyx2)
- d2ab7d8 add README.md (Liangyx2)
- bcdf209 Update README.md (Liangyx2)
- 102649b Update README.md (Liangyx2)
- 36a28a4 Update README.md (Liangyx2)
- 548fdd9 Add files via upload (Liangyx2)
- 36448ea Delete intel_extension_for_transformers/neural_chat/tests/ci/tools/te… (Liangyx2)
- 26e3e9d Update requirements.txt (Liangyx2)
- e4793d3 Update README.md (Liangyx2)
- 0569b54 Update hn_mine.py (Liangyx2)
- 2d15ec0 Update README.md (Liangyx2)
- e8127e9 Update ragas_evaluation.py (Liangyx2)
- 321e9b6 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- f9b4dab Update requirements.txt (Liangyx2)
- 76dc219 Update README.md (Liangyx2)
- b9db553 Update README.md (Liangyx2)
- d7b68cb Update README.md (Liangyx2)
- 48de606 Update requirements.txt (Liangyx2)
- 415ebc8 Update ragas_evaluation.py (Liangyx2)
- f03badd Update test_evaluation.py (Liangyx2)
- 2b92e74 Update README.md (Liangyx2)
- 9091729 Update retrieval_dataset_construction.py (Liangyx2)
- be32736 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 2c4f452 Update hn_mine.py (Liangyx2)
- c48f66a Update llm_generate_raw_data.py (Liangyx2)
- 654c44a Update mine_hard_negatives_check_similarity.py (Liangyx2)
- 5208c98 Update hn_mine.py (Liangyx2)
- ace1090 Update test_evaluation.py (Liangyx2)
- 83f10e9 Update ragas_evaluation.py (Liangyx2)
- ac0aef1 Update README.md (Liangyx2)
- 8deaabd Update README.md (Liangyx2)
- 2eb084c Update README.md (Liangyx2)
- 510e801 Update README.md (Liangyx2)
- dd1f37c Update README.md (Liangyx2)
- ed95d2d Update prompt.py (Liangyx2)
- e253f41 Update ragas_evaluation.py (Liangyx2)
- fc0b6b9 add evaluate_retrieval_auto.py (Liangyx2)
- 6f081b5 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 746adec Update evaluate_retrieval_auto.py (Liangyx2)
- 100322e [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 5e07789 Update evaluate_retrieval.py (Liangyx2)
- 0a2f742 Update ragas_evaluation.py (Liangyx2)
- 1752684 Update test_evaluation.py (Liangyx2)
- 2a2238e Update ragas_evaluation.py (Liangyx2)
- e8f0f9c Update README.md (Liangyx2)
- 8d65078 Update and rename evaluate_retrieval_auto.py to evaluate_retrieval_be… (Liangyx2)
- a951a89 Update evaluate_retrieval_benchmark.py (Liangyx2)
- 13921f6 add retrieval_benchmark.py (Liangyx2)
- 02c0813 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- d212d66 Update retrieval_benchmark.py (Liangyx2)
- 20529a4 add ragas_benchmark ragas_evaluation_benchmark (Liangyx2)
- 5026421 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- cfa7d9c Update retrieval_benchmark.py (Liangyx2)
- 8d1215e Update evaluate_retrieval_benchmark.py (Liangyx2)
- 3458a8e Update retrieval_benchmark.py (Liangyx2)
- 4effd37 Update ragas_evaluation_benchmark.py (Liangyx2)
- 3c38ae6 Update ragas_benchmark.py (Liangyx2)
- b02da07 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- a2a7de1 Update ragas_evaluation_benchmark.py (Liangyx2)
- 4191f4b [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 35b2d7d Update evaluate_retrieval_benchmark.py (Liangyx2)
- 56037b9 Update ragas_evaluation_benchmark.py (Liangyx2)
- de44f0d add retrieval_benchmark.sh (Liangyx2)
- 67456e4 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 2a91336 add ragas_benchmark.sh (Liangyx2)
- 8f05a34 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- c64ca3c add data.txt (Liangyx2)
- fbef1f6 Update ragas_benchmark.sh (Liangyx2)
- f50aeb4 Update ragas_evaluation_benchmark.py (Liangyx2)
- 84aea7c Update ragas_benchmark.sh (Liangyx2)
- ad1814a [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 932562d Update and rename ragas_benchmark.py to ragas_superbenchmark.py (Liangyx2)
- 50d8c83 Update evaluate_retrieval_benchmark.py (Liangyx2)
- a4ea5dd Update retrieval_benchmark.sh (Liangyx2)
- 6e29d43 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 702f9a9 Update and rename retrieval_benchmark.py to retrieval_superbenchmark.py (Liangyx2)
- 0452526 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 008a892 add README.md (Liangyx2)
- 5303837 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 8957b18 Update README.md (Liangyx2)
- 96f477c Update README.md (Liangyx2)
- c99856d Update README.md (Liangyx2)
- 19dfb93 Update README.md (Liangyx2)
- 99940f3 Update README.md (Liangyx2)
- 464d52b Update README.md (Liangyx2)
- da2e829 Update README.md (Liangyx2)
- 3ce2cb2 Update README.md (Liangyx2)
- 268d89c Update README.md (Liangyx2)
- 40fc2e9 Update README.md (Liangyx2)
- 13bb3b8 Update README.md (Liangyx2)
- 763bd1d Update README.md (Liangyx2)
- 092e951 add config file for rag evaluation (xmx-521)
- e931143 complete config superbenchmark (xmx-521)
- f0a0cd6 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- 895075b Merge branch 'main' into yuxiang/evaluation (XuhuiRen)
- 6b60154 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- c73a68f Create test_evaluation.py in CI (Liangyx2)
- c6f8906 Update requirements.txt (Liangyx2)
- 7c80ce2 Merge branch 'main' into yuxiang/evaluation (Liangyx2)
- 576ce57 Merge branch 'main' into yuxiang/evaluation (VincyZhang)
- 2a3ddd9 Merge branch 'main' into yuxiang/evaluation (Liangyx2)
- b4c0e67 Merge branch 'main' into yuxiang/evaluation (Liangyx2)
- e75bbe4 Update ragas_evaluation_benchmark.py (Liangyx2)
- a0853a8 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot])
- (head) Merge branch 'main' into yuxiang/evaluation (Liangyx2)
intel_extension_for_transformers/neural_chat/tests/nightly/tools/test_evaluation.py (79 additions, 0 deletions)
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest, os, shutil
from unittest.mock import patch
from intel_extension_for_transformers.neural_chat.tools.evaluation.data_augmentation import retrieval_dataset_construction, llm_generate_truth
from intel_extension_for_transformers.neural_chat.tools.evaluation.retriever import evaluate_retrieval


class TestEvaluation(unittest.TestCase):
    def setUp(self) -> None:
        if os.path.exists("data.jsonl"):
            os.remove("data.jsonl")
        if os.path.exists("data_minedHN.jsonl"):
            os.remove("data_minedHN.jsonl")
        if os.path.exists("data_minedHN_split.jsonl"):
            os.remove("data_minedHN_split.jsonl")
        if os.path.exists("ground_truth.jsonl"):
            os.remove("ground_truth.jsonl")
        if os.path.exists("output"):
            shutil.rmtree("output", ignore_errors=True)
        return super().setUp()

    def tearDown(self) -> None:
        if os.path.exists("data.jsonl"):
            os.remove("data.jsonl")
        if os.path.exists("data_minedHN.jsonl"):
            os.remove("data_minedHN.jsonl")
        if os.path.exists("data_minedHN_split.jsonl"):
            os.remove("data_minedHN_split.jsonl")
        if os.path.exists("ground_truth.jsonl"):
            os.remove("ground_truth.jsonl")
        if os.path.exists("output"):
            shutil.rmtree("output", ignore_errors=True)
        return super().tearDown()

    def test_retrieval_dataset_construction(self):
        argv = ['--llm_model', '/tf_dataset2/models/nlp_toolkit/neural-chat-7b-v3-1',
                '--embedding_model', '/tf_dataset2/inc-ut/gte-base',
                '--input', '/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/assets/docs/retrieve_multi_doc/',
                '--output', 'data',
                '--range_for_sampling', '2-2',
                '--negative_number', '1']
        with patch('sys.argv', ['python retrieval_dataset_construction.py'] + argv):
            retrieval_dataset_construction.main()
        self.assertTrue(os.path.exists("data_minedHN_split.jsonl"))

    def test_llm_generate_truth(self):
        argv = ['--llm_model', '/tf_dataset2/models/nlp_toolkit/neural-chat-7b-v3-1',
                '--input', '/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation/example.jsonl',
                '--output', 'ground_truth.jsonl']
        with patch('sys.argv', ['python llm_generate_truth.py'] + argv):
            llm_generate_truth.main()
        self.assertTrue(os.path.exists("ground_truth.jsonl"))

    def test_evaluate_retrieval(self):
        argv = ['--index_file_jsonl_path', '/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation/candidate_context.jsonl',
                '--query_file_jsonl_path', '/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation/example.jsonl',
                '--embedding_model', '/tf_dataset2/inc-ut/gte-base']
        with patch('sys.argv', ['python evaluate_retrieval.py'] + argv):
            result = evaluate_retrieval.main()
        self.assertIsNotNone(result)


if __name__ == '__main__':
    unittest.main()
```
```diff
@@ -34,6 +34,7 @@ langchain_core==0.1.18
 langid
 librosa
 markdown
+modelscope
 neural-compressor
 neural_speed
 num2words
```
intel_extension_for_transformers/neural_chat/tools/evaluation/__init__.py (16 additions, 0 deletions)
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
...nsion_for_transformers/neural_chat/tools/evaluation/data_augmentation/README.md (114 additions, 0 deletions)
# Retrieval Data Augmentation

## 1. Introduction
In this example, we show how to use data augmentation to construct a retrieval dataset.

* **Context to Question and Mine Hard Negatives**
Generates several specific, open-ended questions based on the context of the provided input file. Each question is directly related to its context, forming a query-positive pair suitable for constructing a retrieval dataset. We then sample negatives from the corpus by mining hard negatives, a widely used method for improving the quality of fine-tuned sentence embedding models.

* **Context, Question to Ground Truth**
Generates the correct answer based on the provided context and question. The answer is directly related to both, which makes it suitable for constructing a synthetic retrieval evaluation dataset.

## 2. Supported Devices
CPU, CUDA

## 3. Requirements
```
git clone https://github.com/intel/intel-extension-for-transformers.git
cd intel-extension-for-transformers/intel_extension_for_transformers/neural_chat
pip install -r requirements.txt
cd pipeline/plugins/retrieval
pip install -r requirements.txt
```
* **On CPU**
```
cd intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation
pip install -r requirements_cpu.txt
```

* **On CUDA**
```
cd intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation
pip install -r requirements_cuda.txt
```
## 4. Retrieval Dataset Construction
### Context to Questions and Mine Hard Negatives
* **On CPU**
```
cd intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/tools/evaluation
python -m data_augmentation.retrieval_dataset_construction \
    --llm_model <llm model path> \
    --embedding_model <embedding model path> \
    --input <your input file path>
```

* **On CUDA**
```
cd intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/tools/evaluation
python -m data_augmentation.retrieval_dataset_construction \
    --llm_model <llm model path> \
    --embedding_model <embedding model path> \
    --input <your input file path> \
    --use_gpu_for_searching True
```

**Some Important Arguments**:
- `llm_model`: The path of the LLM model.
- `embedding_model`: The path of the text embedding model.
- `input`: The path of the file/folder/link containing the content.
- `output`: The name of the output files. The default value is 'data', which produces 'data.jsonl', 'data_minedHN.jsonl', and 'data_minedHN_split.jsonl'.
- `temperature`: Modulates the next-token probabilities and influences the distribution of similarity scores. The default value is 0.8.
- `top_p`: If set to a float < 1, only the smallest set of most probable tokens whose probabilities add up to `top_p` or higher is kept for generation. The default value is 0.9.
- `top_k`: The number of highest-probability vocabulary tokens to keep for top-k filtering. The default value is 40.
- `repetition_penalty`: The parameter for repetition penalty; 1.0 means no penalty. The default value is 2.0.
- `max_new_tokens`: The maximum number of tokens to generate, ignoring the number of tokens in the prompt. The default value is 48.
- `do_sample`: Whether or not to use sampling; greedy decoding is used otherwise. The default value is True.
- `num_beams`: The number of beams for beam search; 1 means no beam search. The default value is 2.
- `num_return_sequences`: The number of independently computed returned sequences for each element in the batch. The default value is 2.
- `use_cache`: Whether the model should use the past key/value attentions (if applicable to the model) to speed up decoding. The default value is True.
- `range_for_sampling`: The range from which to sample negatives. For example, `2-100` means sampling `negative_number` negatives from the top-2 to top-100 documents. You can set a larger range to reduce the difficulty of the negatives (e.g., `60-300` samples negatives from the top-60 to top-300 passages). The default value is '2-10'.
- `negative_number`: The number of sampled negatives. The default value is 5.
- `use_gpu_for_searching`: Whether to use faiss-gpu to retrieve negatives. The default value is False.
- `similarity_threshold`: The cosine similarity threshold used to filter the generated queries. The default value is 0.6.

**Result**:
Three files will be generated; by default they are `data.jsonl`, `data_minedHN.jsonl`, and `data_minedHN_split.jsonl`. The third is the final output dataset, where each line is a dict like this:
```
{"query": str, "pos": List[str], "neg": List[str]}
```
`query` is the generated question, `pos` is a list of positive texts based on the context of the provided input file, and `neg` is a list of negative texts.
See [augmented_example.jsonl](https://github.com/intel/intel-extension-for-transformers/blob/master/intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation/augmented_example.jsonl) for a sample data file.
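A quick way to sanity-check the final dataset is to load each line and verify the schema. This is a minimal sketch, not part of the PR; the helper name `load_retrieval_dataset` is hypothetical, and the default path is the default output name described above.

```python
import json

def load_retrieval_dataset(path="data_minedHN_split.jsonl"):
    """Load a query/pos/neg JSONL dataset, validating each line's schema."""
    examples = []
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, 1):
            ex = json.loads(line)
            # Each line must follow {"query": str, "pos": List[str], "neg": List[str]}
            assert isinstance(ex["query"], str), f"line {lineno}: query must be a string"
            assert isinstance(ex["pos"], list) and ex["pos"], f"line {lineno}: pos must be a non-empty list"
            assert isinstance(ex["neg"], list), f"line {lineno}: neg must be a list"
            examples.append(ex)
    return examples
```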

### Context, Question to Ground Truth
```
cd intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation
python llm_generate_truth.py \
    --llm_model <llm model path> \
    --input example.jsonl \
    --output ground_truth.jsonl
```

**Some Important Arguments**:
- `llm_model`: The path of the LLM model.
- `input`: The path of the JSON data including queries and positives, where each line is a dict like this: ```{"query": str, "pos": List[str]}```. See [example.jsonl](https://github.com/intel/intel-extension-for-transformers/blob/master/intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation/example.jsonl) for a sample data file.
- `output`: The path of the output JSON data.
- `temperature`: Modulates the next-token probabilities and influences the distribution of similarity scores. The default value is 0.8.
- `top_p`: If set to a float < 1, only the smallest set of most probable tokens whose probabilities add up to `top_p` or higher is kept for generation. The default value is 0.9.
- `top_k`: The number of highest-probability vocabulary tokens to keep for top-k filtering. The default value is 40.
- `repetition_penalty`: The parameter for repetition penalty; 1.0 means no penalty. The default value is 2.0.
- `max_new_tokens`: The maximum number of tokens to generate, ignoring the number of tokens in the prompt. The default value is 48.
- `do_sample`: Whether or not to use sampling; greedy decoding is used otherwise. The default value is True.
- `num_beams`: The number of beams for beam search; 1 means no beam search. The default value is 2.
- `num_return_sequences`: The number of independently computed returned sequences for each element in the batch. The default value is 2.
- `use_cache`: Whether the model should use the past key/value attentions (if applicable to the model) to speed up decoding. The default value is True.

**Result**:
Each line of the output JSON data is a dict like this:
```
{"question": str, "context": List[str], "ground_truth": str}
```
`ground_truth` is the generated ground truth, based on the provided question and context.
See [ground_truth.jsonl](https://github.com/intel/intel-extension-for-transformers/blob/master/intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation/ground_truth.jsonl) for a sample data file.
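The output file above can be consumed line by line with a small reader. This is an illustrative sketch, not part of the PR; the helper name `load_ground_truth` is hypothetical.

```python
import json

def load_ground_truth(path="ground_truth.jsonl"):
    """Yield (question, context, ground_truth) triples from the output JSONL."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            ex = json.loads(line)
            # Schema: {"question": str, "context": List[str], "ground_truth": str}
            yield ex["question"], ex["context"], ex["ground_truth"]
```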
intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation/__init__.py (16 additions, 0 deletions)
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
```
intel_extension_for_transformers/neural_chat/tools/evaluation/data_augmentation/answer.jsonl (10 additions, 0 deletions)
```
{"question": "What types of platforms does the organization focus on?", "answer": "The organization focuses on delivering open software and hardware platforms with industry-defining standards, as well as leadership products, open and secure platforms, and resilient manufacturing."}
{"question": "What are the core values that drive our company's actions?", "answer": "The core values driving the company's actions include focusing on having a positive impact on business, society, and the planet by working together with talented individuals. They also emphasize delivering leadership products, open and secure platforms, and resilient manufacturing to support global digitalization and ensure customer success."}
{"question": "What types of companies does Intel invest in?", "answer": "Intel invests in public and private companies."}
{"question": "How has technology been central to our lives in recent years?", "answer": "In recent years, technology has become more essential as it permeates various aspects of our daily lives. This includes advancements in communication, entertainment, transportation, healthcare, and many other sectors. All these rely heavily on semiconductors, which play a crucial role in powering and enabling these technologies."}
{"question": "What is Intel's focus in terms of delivering leadership products?", "answer": "Intel's focus in terms of delivering leadership products includes providing open and secure platforms as well as resilient manufacturing for enabling global digitalization and fueling customer success."}
{"question": "How has Intel been affected by the COVID-19 pandemic so far, and what?", "answer": "Intel has not provided specific details on how they have been directly affected by the COVID-19 pandemic. However, it can be inferred that like many other companies, they might have experienced challenges related to supply chain disruptions, workforce adjustments, and potential changes in demand for their products due to the global economic impact of the pandemic."}
{"question": "How does the company protect personal data to prevent unauthorized access or misuse?", "answer": "The text provided doesn't specifically mention how the company protects personal data to prevent unauthorized access or misuse. However, it highlights the potential consequences of such incidents, which might imply that they have measures in place to minimize these risks."}
{"question": "What are the conditions for accessing third-party IP?", "answer": "The conditions for accessing third-party IP can vary depending on the specific agreement between the parties involved. However, generally, it includes ensuring availability on commercially reasonable terms or at all."}
{"question": "How many customers contribute to the majority of our revenue?", "answer": "A limited number of customers contribute to the majority of your revenue."}
{"question": "When does Intel plan to deliver on its goal of five manufacturing technology nodes in four years?", "answer": "Intel remains on track to deliver on this goal within four years."}
```
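Answer files like the one above can be joined with a ground-truth file on the question text before scoring. This is a hedged sketch, not part of the PR; the helper name `join_answers_with_truth` and the join-on-question convention are assumptions for illustration.

```python
import json

def join_answers_with_truth(answer_path, truth_path):
    """Match generated answers to ground-truth entries by question text."""
    truths = {}
    with open(truth_path, encoding="utf-8") as f:
        for line in f:
            ex = json.loads(line)
            truths[ex["question"]] = ex.get("ground_truth", "")
    pairs = []
    with open(answer_path, encoding="utf-8") as f:
        for line in f:
            ex = json.loads(line)
            pairs.append({
                "question": ex["question"],
                "answer": ex["answer"],
                # None when the question has no ground-truth entry
                "ground_truth": truths.get(ex["question"]),
            })
    return pairs
```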