Skip to content

Win11 下 demo 代码运行报错 #16

@yunchenran

Description

@yunchenran

不论是 Win11 的 Python 环境还是 WSL Ubuntu,均报错:

   ...: parser = EpubParser(engine="unstructured") # epub engines: unstructured
   ...: epub_data = parser.parse(epub_path)
   ...:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[1], line 4
      2 epub_path = "xxx.epub"
      3 parser = EpubParser(engine="unstructured") # epub engines: unstructured
----> 4 epub_data = parser.parse(epub_path)

File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/wisup_e2m/parsers/doc/epub_parser.py:165, in EpubParser.parse(self, file_name, file, extract_images, include_image_link_in_text, ignore_transparent_images, work_dir, image_dir, relative_path, **kwargs)
    163     if k in _epub_parser_params:
    164         kwargs[k] = v
--> 165 return self.get_parsed_data(**kwargs)

File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/wisup_e2m/parsers/doc/epub_parser.py:132, in EpubParser.get_parsed_data(self, file_name, file, extract_images, include_image_link_in_text, ignore_transparent_images, work_dir, image_dir, relative_path, **kwargs)
    129     EpubParser._validate_input_file(file_name)
    131 if self.config.engine == "unstructured":
--> 132     return self._parse_by_unstructured(
    133         file_name=file_name,
    134         file=file,
    135         extract_images=extract_images,
    136         include_image_link_in_text=include_image_link_in_text,
    137         ignore_transparent_images=ignore_transparent_images,
    138         work_dir=work_dir,
    139         image_dir=image_dir,
    140         relative_path=relative_path,
    141     )
    142 else:
    143     raise NotImplementedError(f"Engine {self.config.engine} not supported")

File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/wisup_e2m/parsers/doc/epub_parser.py:84, in EpubParser._parse_by_unstructured(self, file_name, file, extract_images, include_image_link_in_text, ignore_transparent_images, work_dir, image_dir, relative_path)
     79 logger.info(f"Parsing {file_name} using unstructured engine...")
     81 import unstructured
     83 unstructured_elements: List[unstructured.documents.elements.Element] = (
---> 84     self.unstructured_parse_func(
     85         filename=file_name,
     86         file=file,
     87         languages=self.config.langs,
     88     )
     89 )
     91 if extract_images:
     92     epub_images = get_epub_images(
     93         file_name=file_name,
     94         file=file,
     95         target_image_dir=image_dir,
     96         ignore_transparent_images=ignore_transparent_images,
     97     )

File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/unstructured/partition/epub.py:48, in partition_epub(filename, file, metadata_filename, metadata_last_modified, languages, detect_language_per_element, **kwargs)
     44 exactly_one(filename=filename, file=file)
     46 last_modified = get_last_modified_date(filename) if filename else None
---> 48 html_text = convert_file_to_html_text_using_pandoc(
     49     source_format="epub", filename=filename, file=file
     50 )
     52 return partition_html(
     53     text=html_text,
     54     encoding="unicode",
   (...)
     61     **kwargs,
     62 )

File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/unstructured/file_utils/file_conversion.py:67, in convert_file_to_html_text_using_pandoc(source_format, filename, file)
     62         return convert_file_to_text(
     63             filename=tmp_file_path, source_format=source_format, target_format="html"
     64         )
     66 assert filename is not None
---> 67 return convert_file_to_text(
     68     filename=filename, source_format=source_format, target_format="html"
     69 )

File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/unstructured/utils.py:217, in requires_dependencies.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    214 @wraps(func)
    215 def wrapper(*args: _P.args, **kwargs: _P.kwargs):
    216     run_check()
--> 217     return func(*args, **kwargs)

File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/unstructured/file_utils/file_conversion.py:17, in convert_file_to_text(filename, source_format, target_format)
     14 import pypandoc
     16 try:
---> 17     text = pypandoc.convert_file(filename, target_format, format=source_format)
     18 except FileNotFoundError as err:
     19     msg = (
     20         f"Error converting the file to text. Ensure you have the pandoc package installed on"
     21         f" your system. Installation instructions are available at"
     22         f" https://pandoc.org/installing.html. The original exception text was:\n{err}"
     23     )

File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/pypandoc/__init__.py:206, in convert_file(source_file, to, format, extra_args, encoding, outputfile, filters, verify_format, sandbox, cworkdir, sort_files)
    203 if len(discovered_source_files) == 1:
    204     discovered_source_files = discovered_source_files[0]
--> 206 return _convert_input(discovered_source_files, format, 'path', to, extra_args=extra_args,
    207                   outputfile=outputfile, filters=filters,
    208                   verify_format=verify_format, sandbox=sandbox,
    209                   cworkdir=cworkdir, sort_files=sort_files)

File ~/miniconda3/envs/e2m/lib/python3.10/site-packages/pypandoc/__init__.py:387, in _convert_input(source, format, input_type, to, extra_args, outputfile, filters, verify_format, sandbox, cworkdir, sort_files)
    384     input_file = []
    386 if sort_files:
--> 387     input_file = sorted(input_file)
    389 args = [__pandoc_path, '--from=' + format]
    391 args.append('--to=' + to)

TypeError: 'PosixPath' object is not iterable

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions