-
Notifications
You must be signed in to change notification settings - Fork 70
Open
Description
不论是 Win11 的 Python 环境还是 WSL Ubuntu,均报错:
...: parser = EpubParser(engine="unstructured") # epub engines: unstructured
...: epub_data = parser.parse(epub_path)
...:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[1], line 4
2 epub_path = "xxx.epub"
3 parser = EpubParser(engine="unstructured") # epub engines: unstructured
----> 4 epub_data = parser.parse(epub_path)
File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/wisup_e2m/parsers/doc/epub_parser.py:165, in EpubParser.parse(self, file_name, file, extract_images, include_image_link_in_text, ignore_transparent_images, work_dir, image_dir, relative_path, **kwargs)
163 if k in _epub_parser_params:
164 kwargs[k] = v
--> 165 return self.get_parsed_data(**kwargs)
File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/wisup_e2m/parsers/doc/epub_parser.py:132, in EpubParser.get_parsed_data(self, file_name, file, extract_images, include_image_link_in_text, ignore_transparent_images, work_dir, image_dir, relative_path, **kwargs)
129 EpubParser._validate_input_file(file_name)
131 if self.config.engine == "unstructured":
--> 132 return self._parse_by_unstructured(
133 file_name=file_name,
134 file=file,
135 extract_images=extract_images,
136 include_image_link_in_text=include_image_link_in_text,
137 ignore_transparent_images=ignore_transparent_images,
138 work_dir=work_dir,
139 image_dir=image_dir,
140 relative_path=relative_path,
141 )
142 else:
143 raise NotImplementedError(f"Engine {self.config.engine} not supported")
File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/wisup_e2m/parsers/doc/epub_parser.py:84, in EpubParser._parse_by_unstructured(self, file_name, file, extract_images, include_image_link_in_text, ignore_transparent_images, work_dir, image_dir, relative_path)
79 logger.info(f"Parsing {file_name} using unstructured engine...")
81 import unstructured
83 unstructured_elements: List[unstructured.documents.elements.Element] = (
---> 84 self.unstructured_parse_func(
85 filename=file_name,
86 file=file,
87 languages=self.config.langs,
88 )
89 )
91 if extract_images:
92 epub_images = get_epub_images(
93 file_name=file_name,
94 file=file,
95 target_image_dir=image_dir,
96 ignore_transparent_images=ignore_transparent_images,
97 )
File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/unstructured/partition/epub.py:48, in partition_epub(filename, file, metadata_filename, metadata_last_modified, languages, detect_language_per_element, **kwargs)
44 exactly_one(filename=filename, file=file)
46 last_modified = get_last_modified_date(filename) if filename else None
---> 48 html_text = convert_file_to_html_text_using_pandoc(
49 source_format="epub", filename=filename, file=file
50 )
52 return partition_html(
53 text=html_text,
54 encoding="unicode",
(...)
61 **kwargs,
62 )
File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/unstructured/file_utils/file_conversion.py:67, in convert_file_to_html_text_using_pandoc(source_format, filename, file)
62 return convert_file_to_text(
63 filename=tmp_file_path, source_format=source_format, target_format="html"
64 )
66 assert filename is not None
---> 67 return convert_file_to_text(
68 filename=filename, source_format=source_format, target_format="html"
69 )
File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/unstructured/utils.py:217, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
214 @wraps(func)
215 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
216 run_check()
--> 217 return func(*args, **kwargs)
File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/unstructured/file_utils/file_conversion.py:17, in convert_file_to_text(filename, source_format, target_format)
14 import pypandoc
16 try:
---> 17 text = pypandoc.convert_file(filename, target_format, format=source_format)
18 except FileNotFoundError as err:
19 msg = (
20 f"Error converting the file to text. Ensure you have the pandoc package installed on"
21 f" your system. Installation instructions are available at"
22 f" https://pandoc.org/installing.html. The original exception text was:\n{err}"
23 )
File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/pypandoc/__init__.py:206, in convert_file(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir, sort_files)
203 if len(discovered_source_files) == 1:
204 discovered_source_files = discovered_source_files[0]
--> 206 return _convert_input(discovered_source_files, format, 'path', to, extra_args=extra_args,
207 outputfile=outputfile, filters=filters,
208 verify_format=verify_format, sandbox=sandbox,
209 cworkdir=cworkdir, sort_files=sort_files)
File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/pypandoc/__init__.py:387, in _convert_input(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir, sort_files)
384 input_file = []
386 if sort_files:
--> 387 input_file = sorted(input_file)
389 args = [__pandoc_path, '--from=' + format]
391 args.append('--to=' + to)
TypeError: 'PosixPath' object is not iterable
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels