-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathchunk_llamaindex.py
More file actions
87 lines (70 loc) · 2.83 KB
/
chunk_llamaindex.py
File metadata and controls
87 lines (70 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
import json
import os
import time
# Llama_index
# root_dir = 'db_qa_tmp'
# all_data=[]
# for root, dirs, files in os.walk(root_dir):
# for file in files:
# documents = SimpleDirectoryReader(input_files=[os.path.join(root, file)]).load_data()
# node_parser = SimpleNodeParser.from_defaults(
# chunk_size=196, chunk_overlap=0)
# nodes_tmp = node_parser.get_nodes_from_documents(documents, show_progress=True)
# nodes=[]
# for node in nodes_tmp:
# i=node.text
# nodes.append(i)
# with open(os.path.join(root, file), 'r', encoding='utf-8') as cfile:
# content = cfile.read()
# save = {}
# save['raw_corpus'] = content
# save['final_chunks'] = nodes
# all_data.append(save)
# with open('chunk_llamaindex.json', 'w', encoding='utf-8') as sfile:
# json.dump(all_data, sfile, ensure_ascii=False, indent=4)
# # nohup python chunk_llamaindex.py &
# # Original Chunking
# root_dir = 'db_qa_tmp'
# all_data=[]
# for root, dirs, files in os.walk(root_dir):
# for file in files:
# chunk_size=
# with open(os.path.join(root, file), 'r', encoding='utf-8') as file:
#
# content = file.read()
# chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
# save = {}
# save['raw_corpus'] = content
# save['final_chunks'] = chunks
# all_data.append(save)
# with open('chunk_original.json', 'w', encoding='utf-8') as sfile:
# json.dump(all_data, sfile, ensure_ascii=False, indent=4)
# # nohup python chunk_llamaindex.py &
# Semantic Chunking
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import json
import time


def semantic_chunk(split_path, save_path,
                   model_name="BAAI/bge-base-zh-v1.5",
                   buffer_size=1,
                   breakpoint_percentile_threshold=68):
    """Semantically chunk every document under *split_path*.

    Loads all files in *split_path* with ``SimpleDirectoryReader``, splits
    each document at embedding-similarity breakpoints via
    ``SemanticSplitterNodeParser``, drops whitespace-only chunks, and writes
    the flat list of chunk texts to *save_path* as a JSON array.

    Parameters
    ----------
    split_path : str
        Directory containing the raw corpus files to split.
    save_path : str
        Output path for the JSON list of chunk strings.
    model_name : str
        HuggingFace embedding model used to score sentence similarity
        (default is a Chinese bge model, matching the original script).
    buffer_size : int
        Number of sentences grouped together when evaluating similarity.
    breakpoint_percentile_threshold : int
        Dissimilarity percentile above which a split point is inserted.

    Returns
    -------
    list[str]
        The non-empty chunk texts that were also written to *save_path*.

    Raises
    ------
    ValueError
        If either path is empty (the original placeholders were left blank,
        which previously crashed deep inside the reader/``open`` instead).
    """
    if not split_path or not save_path:
        raise ValueError("split_path and save_path must be set before running")

    documents = SimpleDirectoryReader(split_path).load_data()
    embed_model = HuggingFaceEmbedding(model_name=model_name)
    splitter = SemanticSplitterNodeParser(
        buffer_size=buffer_size,
        breakpoint_percentile_threshold=breakpoint_percentile_threshold,
        embed_model=embed_model,
    )

    all_data = []
    for document in documents:
        # Split one document at a time so progress is reported per file.
        nodes = splitter.get_nodes_from_documents([document], show_progress=True)
        # Keep only chunks with real content; the splitter can emit
        # whitespace-only nodes at document boundaries.
        all_data.extend(node.text for node in nodes if node.text.strip() != '')

    with open(save_path, 'w', encoding='utf-8') as sfile:
        json.dump(all_data, sfile, ensure_ascii=False, indent=4)
    return all_data


if __name__ == "__main__":
    start_time = time.time()
    # TODO: fill in the input corpus directory and the output JSON path.
    semantic_chunk(split_path='', save_path='')
    execution_time = time.time() - start_time
    print(f"The program execution time is: {execution_time} s.")
# CUDA_VISIBLE_DEVICES=1 nohup python chunk_llamaindex.py &