diff --git a/requirements.txt b/requirements.txt
index 88e2f35..443d114 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,4 +18,5 @@ pydantic
 camelot-py[base]
 opencv-python
 pymupdf_layout
-xgboost
\ No newline at end of file
+xgboost
+pyspellchecker
\ No newline at end of file
diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index a51ee2c..7f7d8ad 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -19,10 +19,30 @@ import json
 
 from typing import List, Dict
 import camelot
+import cv2
+import numpy as np
+from spellchecker import SpellChecker
 
 Image.MAX_IMAGE_PIXELS = None
 fitz.TOOLS.mupdf_display_errors(False)
 
+# Maximum allowed ratio of misspelled words to total words in a pdf
+MAX_SPELLING_ERROR_RATE = 0.05
+
+
+def check_spelling(text: str) -> float:
+    """Return the ratio of misspelled word occurrences to total words in ``text``.
+
+    Returns 1.0 if no words are detected in the input string.
+    """
+    spell_checker = SpellChecker()
+    words = spell_checker.split_words(text)
+    if not words:
+        return 1.0
+    # unknown() returns a set, so count every occurrence rather than unique words.
+    misspelled = spell_checker.unknown(words)
+    return sum(word.lower() in misspelled for word in words) / len(words)
+
 
 def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
     """Extract tables using camelot-py (fallback method).
@@ -138,30 +158,64 @@ def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
     return tables_data
 
 
+def parse_page_ocr(page: fitz.Page) -> str:
+    """Extract text from a PDF page by rendering it and running OCR.
+
+    The page is rasterized at 300 DPI, converted to grayscale and denoised
+    before being handed to pytesseract.
+    """
+    pix = page.get_pixmap(dpi=300)
+    img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
+    # PIL yields RGB channel order, so RGB2GRAY (not BGR2GRAY) applies the
+    # correct luminance weights to the red and blue channels.
+    gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
+    denoised = cv2.fastNlMeansDenoising(
+        gray, h=10, templateWindowSize=7, searchWindowSize=21
+    )
+    return pytesseract.image_to_string(denoised)
+
+
+def parse_page_embedded(page: fitz.Page) -> str:
+    """Return the text embedded in a PDF page with null bytes removed."""
+    # Clean out null bytes (UTF-16 artifacts) from the extracted text.
+    return page.get_text("text").replace("\x00", "")
+
+
+def _extract_document_text(doc: fitz.Document) -> str:
+    """Extract text from an open document, preferring embedded text.
+
+    Pages without embedded text are OCR'd individually. If the combined
+    text exceeds MAX_SPELLING_ERROR_RATE, the embedded text is treated as
+    unreliable and every page is extracted again with OCR.
+    """
+    pages = []
+    used_embedded = False
+    for page in doc:
+        page_text = parse_page_embedded(page)
+        if page_text.strip():
+            used_embedded = True
+        else:
+            # No embedded text on this page: treat it as an image.
+            page_text = parse_page_ocr(page)
+        pages.append(page_text)
+    text = "\n".join(pages)
+
+    # Re-running OCR is pointless when every page was already OCR'd.
+    if used_embedded and check_spelling(text) > MAX_SPELLING_ERROR_RATE:
+        text = "\n".join(parse_page_ocr(page) for page in doc)
+    return text
+
+
 def extract_text_from_pdf(pdf_path: str) -> str:
-    text = []
+    """Extract text from the PDF at ``pdf_path``; return an empty string on failure."""
     try:
         with fitz.open(pdf_path) as doc:
-            for page_num, page in enumerate(doc, start=1):
-                # Extract text from the page using PyMuPDF
-                page_text = page.get_text("text")
-
-                # Clean out null bytes or UTF-16 artifacts
-                if "\x00" in page_text:
-                    page_text = page_text.replace("\x00", "")
-
-                # If the page is mostly empty, treat as image and use OCR
-                if not page_text.strip():
-                    pix = page.get_pixmap(dpi=300)
-                    img = Image.open(io.BytesIO(pix.tobytes("png")))
-                    page_text = pytesseract.image_to_string(img)
-
-                text.append(page_text)
+            text = _extract_document_text(doc)
     except Exception as e:
         print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
         return ""
 
-    return "\n".join(text)
+    return text
 
 
 def extract_text_from_pdf_bytes(data: bytes) -> str:
@@ -170,18 +224,10 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
     try:
         with fitz.open(stream=data, filetype="pdf") as doc:
-            for page_num, page in enumerate(doc, start=1):
-                page_text = page.get_text("text")
-                if "\x00" in page_text:
-                    page_text = page_text.replace("\x00", "")
-                if not page_text.strip():
-                    pix = page.get_pixmap(dpi=300)
-                    img = Image.open(io.BytesIO(pix.tobytes("png")))
-                    page_text = pytesseract.image_to_string(img)
-                text.append(page_text)
+            text = _extract_document_text(doc)
     except Exception as e:
         print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr)
         return ""
-    return "\n".join(text)
+    return text
 
 
 def save_to_file(text: str, output_path: str):