"""Convert PDF files to Markdown, using OCR for PDFs without a text layer."""

import os
import sys

import pdf2image
import pytesseract
from markitdown import MarkItDown
from PIL import Image

# Configure tesseract path
pytesseract.pytesseract.tesseract_cmd = r'D:\Tesseract OCR\tesseract.exe'
poppler_path = r'D:\Release-25.07.0-0\poppler-25.07.0\Library\bin'


def _markdown_path(pdf_file, output_folder):
    """Return the ``.md`` output path in *output_folder* for *pdf_file*.

    Only the final extension is swapped, and it is matched regardless of
    case, so ``REPORT.PDF`` becomes ``REPORT.md`` and ``a.pdf.v2.pdf``
    becomes ``a.pdf.v2.md``.
    """
    stem, _ext = os.path.splitext(os.path.basename(pdf_file))
    return os.path.join(output_folder, stem + '.md')


def detect_type(pdf_file, min_text_chars=50):
    """Classify a PDF as ``"text"`` or ``"image"``.

    The PDF is run through MarkItDown; if the extracted text, stripped of
    surrounding whitespace, is shorter than *min_text_chars* characters the
    file is treated as image-based and should go through OCR.

    Args:
        pdf_file: Path to the PDF file.
        min_text_chars: Minimum number of extracted characters for the PDF
            to count as text-based.

    Returns:
        ``"text"`` or ``"image"``. Any extraction error is printed and
        reported as ``"image"`` so the caller falls back to OCR.
    """
    try:
        result = MarkItDown().convert(pdf_file)
        if len(result.text_content.strip()) < min_text_chars:
            return "image"
        return "text"
    except Exception as e:
        print(f"Error detecting PDF type: {e}")
        return "image"


def convert_text_pdf(pdf_file, output_folder):
    """Convert a text-based PDF to Markdown with MarkItDown.

    Args:
        pdf_file: Path to the PDF file.
        output_folder: Existing folder that receives the ``.md`` file.

    Returns:
        ``True`` on success; ``False`` if conversion or writing failed (the
        error is printed, not raised, so a batch run can continue).
    """
    try:
        result = MarkItDown().convert(pdf_file)
        output_path = _markdown_path(pdf_file, output_folder)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(result.text_content)

        print(f"Converted {pdf_file} -> {output_path}")
        return True
    except Exception as e:
        print(f"Error converting {pdf_file}: {e}")
        return False


def convert_image_pdf(pdf_file, output_folder, lang='vie', dpi=300):
    """Convert an image-based (scanned) PDF to Markdown using Tesseract OCR.

    Every page is rendered to an image, converted to grayscale and passed to
    Tesseract. The output starts with a ``# <pdf file name>`` heading and
    has one ``## Trang <n>`` section per page that produced any text.

    Args:
        pdf_file: Path to the PDF file.
        output_folder: Existing folder that receives the ``.md`` file.
        lang: Tesseract language code; defaults to Vietnamese (``'vie'``).
        dpi: Resolution used when rendering the PDF pages.

    Returns:
        ``True`` on success; ``False`` if rendering, OCR or writing failed
        (the error is printed, not raised, so a batch run can continue).
    """
    try:
        pages = pdf2image.convert_from_path(
            pdf_file, dpi=dpi, poppler_path=poppler_path
        )

        pdf_name = os.path.basename(pdf_file)
        parts = [f"# {pdf_name}\n\n"]

        for page_number, page in enumerate(pages, start=1):
            # 'L' is Pillow's 8-bit grayscale mode.
            grayscale = page.convert('L')

            # Per Tesseract's CLI options: --oem 3 selects the default
            # engine, --psm 6 assumes a single uniform block of text.
            text = pytesseract.image_to_string(
                grayscale,
                lang=lang,
                config='--oem 3 --psm 6'
            )

            # Pages with no recognised text get no section at all.
            if text.strip():
                parts.append(f'## Trang {page_number}\n\n{text}\n\n')

        output_path = _markdown_path(pdf_file, output_folder)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write("".join(parts))

        print(f"Converted {pdf_file} -> {output_path}")
        return True
    except Exception as e:
        print(f"Error converting {pdf_file}: {e}")
        return False
def smart_convert_pdf(pdf_file, output_folder=r"D:\PythonProject\ToMD\Output"):
    """Convert one PDF to Markdown, choosing the method by detected type.

    The output folder is created if needed. ``detect_type`` decides whether
    the PDF is handled by ``convert_text_pdf`` or ``convert_image_pdf``.

    Args:
        pdf_file: Path to the PDF file.
        output_folder: Folder that receives the ``.md`` file.

    Returns:
        ``True`` if the conversion succeeded, ``False`` otherwise. Failure
        to create the output folder is printed and reported as ``False``
        rather than raised, so one bad folder does not abort a batch.
    """
    try:
        os.makedirs(output_folder, exist_ok=True)
    except OSError as e:
        print(f"Error creating output folder {output_folder}: {e}")
        return False

    pdf_type = detect_type(pdf_file)
    print(f"Detected PDF type: {pdf_type}")

    if pdf_type == "text":
        return convert_text_pdf(pdf_file, output_folder)
    return convert_image_pdf(pdf_file, output_folder)


def main(data_folder=r'D:\PythonProject\ToMD\Data', output_folder=None):
    """Convert every PDF file found directly inside *data_folder*.

    Args:
        data_folder: Folder scanned (non-recursively) for ``.pdf`` files;
            the extension is matched case-insensitively.
        output_folder: Folder for the Markdown output. ``None`` keeps the
            default output folder of ``smart_convert_pdf``.

    Returns:
        ``None``. Progress and per-file results are printed.
    """
    # isdir (not exists): a regular file at this path cannot be listed.
    if not os.path.isdir(data_folder):
        print(f"Data folder not found: {data_folder}")
        return

    # Sorted for a reproducible order; sub-directories whose names happen
    # to end in ".pdf" are skipped.
    pdf_files = sorted(
        name for name in os.listdir(data_folder)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(data_folder, name))
    )

    if not pdf_files:
        print("No PDF files found in the data folder")
        return

    print(f"Found {len(pdf_files)} PDF files")

    for pdf_file in pdf_files:
        full_path = os.path.join(data_folder, pdf_file)
        print(f"\nProcessing: {pdf_file}")

        if output_folder is None:
            success = smart_convert_pdf(full_path)
        else:
            success = smart_convert_pdf(full_path, output_folder)

        if success:
            print("Conversion successful")
        else:
            print("Conversion failed")

        print("-" * 50)


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Accompanying README.md (PDF to Markdown/README.md) -- title and introduction,
# kept here because they followed the script on the same source line:
#
#   # PDF to Markdown Converter
#
#   A Python script that intelligently converts PDF files to Markdown format.
#   The script automatically detects whether a PDF is text-based or image-based
#   and applies the appropriate conversion method.

## Features

- **Smart Detection**: Automatically identifies whether a PDF has an extractable text layer or consists of scanned images (a PDF that yields fewer than 50 characters of text is treated as image-based)
- **Text PDF Conversion**: Extracts text directly from text-based PDFs
- **Image PDF Conversion**: Uses OCR (Optical Character Recognition) to extract text from image-based PDFs
- **Batch Processing**: Converts multiple PDF files at once
- **Vietnamese Language Support**: OCR runs with Tesseract's Vietnamese (`vie`) language data

## Requirements

- Python 3.x
- pytesseract
- Pillow (PIL)
- markitdown
- pdf2image
- Tesseract OCR (installed separately), including the Vietnamese (`vie`) language data
- Poppler (installed separately)

## Installation

1. Install Python dependencies:
```bash
pip install pytesseract Pillow markitdown pdf2image
```

2. Install Tesseract OCR from [https://github.com/tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract). Make sure the Vietnamese language data (`vie.traineddata`) is installed, because the script runs OCR with `lang='vie'`.

3. Install Poppler from [https://github.com/oschwartz10612/poppler-windows/releases](https://github.com/oschwartz10612/poppler-windows/releases)

4. Update the paths in the script:
   - `tesseract_cmd`: Path to your Tesseract executable
   - `poppler_path`: Path to your Poppler bin folder
   - `data_folder`: Folder containing your PDF files
   - `output_folder`: Folder where Markdown files will be saved

## Usage

1. Place your PDF files in the data folder
2. Run the script:
```bash
python PDFtoMD.py
```

3. Find the converted Markdown files in the output folder

## How It Works

1. The script scans the data folder for PDF files
2. For each PDF, it detects whether it is text-based or image-based by checking how much text can be extracted directly
3. Text PDFs are converted directly using MarkItDown
4. Image PDFs are rendered page by page and converted using OCR with pytesseract
5. Output is saved as `.md` files with the same base name as the input PDF

## License

This project is open source and available under the MIT License.