|
| 1 | +#!/usr/bin/env python |
| 2 | +# encoding: utf-8 |
| 3 | +# Licensed to the Apache Software Foundation (ASF) under one or more |
| 4 | +# contributor license agreements. See the NOTICE file distributed with |
| 5 | +# this work for additional information regarding copyright ownership. |
| 6 | +# The ASF licenses this file to You under the Apache License, Version 2.0 |
| 7 | +# (the "License"); you may not use this file except in compliance with |
| 8 | +# the License. You may obtain a copy of the License at |
| 9 | +# |
| 10 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +# |
| 12 | +# Unless required by applicable law or agreed to in writing, software |
| 13 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | +# See the License for the specific language governing permissions and |
| 16 | +# limitations under the License. |
| 17 | +# |
| 18 | + |
from io import StringIO

from bs4 import BeautifulSoup
# `import .tika.parser` is a SyntaxError: relative imports require the
# `from ... import ...` form, and the code below uses the bare name `parser`.
from tika import parser
| 22 | + |
def text_from_pdf_pages(filename):
    """Extract the text of a PDF as a list of per-page strings.

    Parameters
    ----------
    filename : str
        Path to the PDF file to parse.

    Returns
    -------
    list[str]
        Stripped text of each page, in page order. Pages with no
        extractable text (e.g. image-only scans) yield an empty string.
    """
    pages_txt = []

    # Ask Tika for XHTML output so the per-page <div class="page">
    # boundaries survive and can be split below.
    data = parser.from_file(filename, xmlContent=True)

    # Name the parser explicitly: omitting it triggers bs4's
    # "no parser was explicitly specified" warning and makes the result
    # depend on which parsers happen to be installed.
    xhtml_data = BeautifulSoup(data['content'], 'html.parser')

    for page_div in xhtml_data.find_all('div', attrs={'class': 'page'}):
        # Re-parse the single page through Tika to reduce it to plain text.
        # Passing str(page_div) directly replaces the original StringIO
        # write/getvalue round-trip, which only rebuilt the same string.
        parsed_content = parser.from_buffer(str(page_div))

        # Tika returns None for 'content' when a page has no extractable
        # text; substitute '' instead of crashing on None.strip().
        text = (parsed_content['content'] or '').strip()
        pages_txt.append(text)

    return pages_txt
0 commit comments