|
| 1 | +#!/usr/bin/env python |
| 2 | +# encoding: utf-8 |
| 3 | +# Licensed to the Apache Software Foundation (ASF) under one or more |
| 4 | +# contributor license agreements. See the NOTICE file distributed with |
| 5 | +# this work for additional information regarding copyright ownership. |
| 6 | +# The ASF licenses this file to You under the Apache License, Version 2.0 |
| 7 | +# (the "License"); you may not use this file except in compliance with |
| 8 | +# the License. You may obtain a copy of the License at |
| 9 | +# |
| 10 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +# |
| 12 | +# Unless required by applicable law or agreed to in writing, software |
| 13 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | +# See the License for the specific language governing permissions and |
| 16 | +# limitations under the License. |
| 17 | +# |
| 18 | + |
from io import StringIO

from bs4 import BeautifulSoup
# `import .tika.parser` is a SyntaxError: relative imports require the
# `from ... import ...` form, and the code below uses the bare name `parser`.
from tika import parser
| 22 | + |
def text_from_pdf_pages(filename):
    """Extract the text of a PDF as a list of per-page strings.

    Parameters
    ----------
    filename : str
        Path to the PDF file to parse.

    Returns
    -------
    list[str]
        Stripped text of each page, in page order. Pages with no
        extractable text (e.g. image-only scans) yield an empty string.
    """
    pages_txt = []

    # Ask Tika for XHTML output so the per-page <div class="page">
    # boundaries survive and can be split below.
    data = parser.from_file(filename, xmlContent=True)

    # Name the parser explicitly: omitting it triggers bs4's
    # "no parser was explicitly specified" warning and makes the result
    # depend on which parsers happen to be installed.
    xhtml_data = BeautifulSoup(data['content'], 'html.parser')

    for page_div in xhtml_data.find_all('div', attrs={'class': 'page'}):
        # Re-parse the single page through Tika to reduce it to plain text.
        # Passing str(page_div) directly replaces the original StringIO
        # write/getvalue round-trip, which only rebuilt the same string.
        parsed_content = parser.from_buffer(str(page_div))

        # Tika returns None for 'content' when a page has no extractable
        # text; substitute '' instead of crashing on None.strip().
        text = (parsed_content['content'] or '').strip()
        pages_txt.append(text)

    return pages_txt
0 commit comments