-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathchunk_llamaindex.py
More file actions
87 lines (70 loc) · 2.83 KB
/
chunk_llamaindex.py
File metadata and controls
87 lines (70 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader
from llama_index.node_parser import SimpleNodeParser
import json
import os
import time
# Llama_index
# root_dir = 'db_qa_tmp'
# all_data=[]
# for root, dirs, files in os.walk(root_dir):
# for file in files:
# documents = SimpleDirectoryReader(input_files=[os.path.join(root, file)]).load_data()
# node_parser = SimpleNodeParser.from_defaults(
# chunk_size=196, chunk_overlap=0)
# nodes_tmp = node_parser.get_nodes_from_documents(documents, show_progress=True)
# nodes=[]
# for node in nodes_tmp:
# i=node.text
# nodes.append(i)
# with open(os.path.join(root, file), 'r', encoding='utf-8') as cfile:
# content = cfile.read()
# save = {}
# save['raw_corpus'] = content
# save['final_chunks'] = nodes
# all_data.append(save)
# with open('chunk_llamaindex.json', 'w', encoding='utf-8') as sfile:
# json.dump(all_data, sfile, ensure_ascii=False, indent=4)
# # nohup python chunk_llamaindex.py &
# # Original Chunking
# root_dir = 'db_qa_tmp'
# all_data=[]
# for root, dirs, files in os.walk(root_dir):
# for file in files:
# chunk_size=
# with open(os.path.join(root, file), 'r', encoding='utf-8') as file:
#
# content = file.read()
# chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
# save = {}
# save['raw_corpus'] = content
# save['final_chunks'] = chunks
# all_data.append(save)
# with open('chunk_original.json', 'w', encoding='utf-8') as sfile:
# json.dump(all_data, sfile, ensure_ascii=False, indent=4)
# # nohup python chunk_llamaindex.py &
# Semantic Chunking
from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import json
import time


def semantic_chunk(split_path, save_path,
                   model_name="BAAI/bge-base-zh-v1.5",
                   buffer_size=1,
                   breakpoint_percentile_threshold=68):
    """Semantically chunk every document under *split_path*.

    Loads all files in *split_path* with ``SimpleDirectoryReader``, splits
    each document at embedding-similarity breakpoints via
    ``SemanticSplitterNodeParser``, drops whitespace-only chunks, and writes
    the flat list of chunk texts to *save_path* as a JSON array.

    Parameters
    ----------
    split_path : str
        Directory containing the raw corpus files to split.
    save_path : str
        Output path for the JSON list of chunk strings.
    model_name : str
        HuggingFace embedding model used to score sentence similarity
        (default is a Chinese bge model, matching the original script).
    buffer_size : int
        Number of sentences grouped together when evaluating similarity.
    breakpoint_percentile_threshold : int
        Dissimilarity percentile above which a split point is inserted.

    Returns
    -------
    list[str]
        The non-empty chunk texts that were also written to *save_path*.

    Raises
    ------
    ValueError
        If either path is empty (the original placeholders were left blank,
        which previously crashed deep inside the reader/``open`` instead).
    """
    if not split_path or not save_path:
        raise ValueError("split_path and save_path must be set before running")

    documents = SimpleDirectoryReader(split_path).load_data()
    embed_model = HuggingFaceEmbedding(model_name=model_name)
    splitter = SemanticSplitterNodeParser(
        buffer_size=buffer_size,
        breakpoint_percentile_threshold=breakpoint_percentile_threshold,
        embed_model=embed_model,
    )

    all_data = []
    for document in documents:
        # Split one document at a time so progress is reported per file.
        nodes = splitter.get_nodes_from_documents([document], show_progress=True)
        # Keep only chunks with real content; the splitter can emit
        # whitespace-only nodes at document boundaries.
        all_data.extend(node.text for node in nodes if node.text.strip() != '')

    with open(save_path, 'w', encoding='utf-8') as sfile:
        json.dump(all_data, sfile, ensure_ascii=False, indent=4)
    return all_data


if __name__ == "__main__":
    start_time = time.time()
    # TODO: fill in the input corpus directory and the output JSON path.
    semantic_chunk(split_path='', save_path='')
    execution_time = time.time() - start_time
    print(f"The program execution time is: {execution_time} s.")
# CUDA_VISIBLE_DEVICES=1 nohup python chunk_llamaindex.py &