-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathsetup.py
More file actions
121 lines (93 loc) · 4.17 KB
/
setup.py
File metadata and controls
121 lines (93 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from setuptools import find_packages, setup
DESCRIPTION = 'An Open source Pdf Parser Python library'
INCLUDE_FROM_PACKAGES = ["doc_parser", "doc_parser.*"]
VERSION = "0.1.6.8"
# 获取主版本号
# 输入0.1.5.5 输出0.1.5
def get_main_version(version):
return version.rsplit('.', 1)[0]
# read version number from version.txt, otherwise alpha version
# Github CI can create version.txt dynamically.
def get_version(fname):
if os.path.exists(fname):
with open(fname, "r", encoding="utf-8") as f:
version = f.readline().strip()
else:
version = VERSION
return version
# Load README.md for long description
def load_long_description(fname):
if os.path.exists(fname):
with open(fname, "r", encoding="utf-8") as f:
long_description = f.read()
else:
long_description = DESCRIPTION
return long_description
def load_requirements(fname):
"""纯文件读取,不依赖 pip"""
if not os.path.isfile(fname):
return [] # 文件不存在就当成空依赖
with open(fname, encoding="utf-8") as f:
lines = f.readlines()
reqs = []
for line in lines:
line = line.strip()
if line == "" or line.startswith("#"): # 去掉空行和注释
continue
# 简单处理 --find-links 等命令行参数(如果有)
if line.startswith("-"):
continue
reqs.append(line)
return reqs
if __name__ == "__main__":
setup(
name="bella-domify",
version=get_version("version.txt"),
keywords=["document-doc_parser"],
description=DESCRIPTION,
long_description=load_long_description("README.md"),
long_description_content_type="text/markdown",
license="GPL v3",
author=["tangxiaolong", "luxu", "zhangxiaojia"],
author_email="",
# 暂不写,后续换成github地址
url="",
packages=find_packages(include=INCLUDE_FROM_PACKAGES),
include_package_data=True,
zip_safe=False,
install_requires=load_requirements("requirements.txt"),
python_requires=">=3.6"
)
"""
发包命令:
python setup.py bdist_wheel upload -r ke
版本说明:
version = '0.1.1.0' 20240816 页眉新方案上线;
version = '0.1.2.0' 20240822 目录、封面识别并去除;接口解析接口图片附带S3链接;
version = '0.1.3.0' 20240823 封面识别前提大于3页(含)
version = '0.1.3.1' 20240826 parser日志输出优化
version = '0.1.3.2' 20240826 读取config强校验去除
version = '0.1.3.4' 20240829 修复特殊字符引起的解析结果打印异常;页脚阈值放宽;
version = '0.1.3.6' 20240830 不常见字体兼容;
version = '0.1.3.7' 20240912 FAQ文件QA切分输出;
version = '0.1.3.10' 20240919 大文件oom问题优化;
version = '0.1.3.11' 20240919 image_link属性丢失问题修复;
version = '0.1.3.12' 20240920 特殊字体的默认读入设置改为Helvetica;
version = '0.1.3.13' 20240923 block切分准确率:0.96;
version = '0.1.4.0' 20240923 block优化里程碑(同0.1.3.13);
version = '0.1.5.0' 20241022 层级优化里程碑0.80 (336 / 422);
version = '0.1.5.1' 20241023 bug修复;
version = '0.1.5.2' 20241023 bug修复;
version = '0.1.5.4' 20241024 目录过滤,默认值改为是;
version = '0.1.5.5' 20241104 页眉阈值优化;
version = '0.1.6.0' 20241126 block优化(跨列合并、单行文字字号不统一等)横版pdf忽略页眉识别;
version = '0.1.6.1' 20241206 bug修复,获取字体等统计量时兼容image;
version = '0.1.6.2' 20241209 依赖版本修改openpyxl==3.1.5;
version = '0.1.6.3' 20250113 优化FAQ判断逻辑(添加空页情况兼容);
version = '0.1.6.4' 20250220 添加图片信息提取功能,添加tomarkdown;
version = '0.1.6.7' 20250604 代码clear基本完成,组织结构进行调整
version = '0.1.6.8' 20250626 取消FAQDomtree类型输出,不再用模型提取FAQ类文件
"""