From 33cab3a10d615c5a320b991e849199d7eb1ea120 Mon Sep 17 00:00:00 2001 From: Bradley Rule Date: Sun, 18 Jan 2026 17:59:16 -0800 Subject: [PATCH 1/4] Added image denoising to OCR --- requirements.txt | 3 ++- src/preprocessing/pdf_text_extraction.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index b940946..92b8f54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,5 @@ google-auth-oauthlib google-auth-httplib2 numpy google-api-python-client -xgboost \ No newline at end of file +xgboost +opencv-python \ No newline at end of file diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index bbd9a6b..48e7c49 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -16,6 +16,8 @@ import argparse from pathlib import Path import sys +import cv2 +import numpy as np Image.MAX_IMAGE_PIXELS = None fitz.TOOLS.mupdf_display_errors(False) @@ -38,8 +40,9 @@ def extract_text_from_pdf(pdf_path: str) -> str: if not page_text.strip(): pix = page.get_pixmap(dpi=300) img = Image.open(io.BytesIO(pix.tobytes("png"))) - page_text = pytesseract.image_to_string(img) - + img_array = np.array(img) + img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 15) + page_text = pytesseract.image_to_string(img_array) text.append(page_text) except Exception as e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) @@ -60,7 +63,9 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: if not page_text.strip(): pix = page.get_pixmap(dpi=300) img = Image.open(io.BytesIO(pix.tobytes("png"))) - page_text = pytesseract.image_to_string(img) + img_array = np.array(img) + img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 15) + page_text = pytesseract.image_to_string(img_array) text.append(page_text) except Exception as e: print(f"[ERROR] Failed to extract text from PDF 
bytes: {e}", file=sys.stderr) From 1902005f859ac0550725fa9eadc2c0a638f7b886 Mon Sep 17 00:00:00 2001 From: bradley Date: Sun, 15 Feb 2026 18:40:07 -0800 Subject: [PATCH 2/4] reworked OCR implementation to improve text quality --- requirements.txt | 3 +- src/preprocessing/pdf_text_extraction.py | 98 +++++++++++++++++------- 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/requirements.txt b/requirements.txt index ce0445c..443d114 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,5 @@ camelot-py[base] opencv-python pymupdf_layout xgboost -opencv-python \ No newline at end of file +opencv-python +pyspellchecker \ No newline at end of file diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index 21a4312..9222bb7 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -21,10 +21,28 @@ import camelot import cv2 import numpy as np +from spellchecker import SpellChecker Image.MAX_IMAGE_PIXELS = None fitz.TOOLS.mupdf_display_errors(False) +# Maximum allowed ratio of misspelled words to total words in a pdf +MAX_SPELLING_ERROR_RATE = 0.05 + + +def check_spelling(text: str) -> float: + """ + Returns ratio of mispelled words to total words + + Returns 1 if no words detected in input string + """ + spellChecker = SpellChecker() + words = spellChecker.split_words(text) + if (len(words) == 0): + return 1 + misspelled = spellChecker.unknown(words) + return (len(misspelled) / len(words)) + def extract_tables_with_camelot(pdf_path: str) -> List[Dict]: """Extract tables using camelot-py (fallback method). 
@@ -139,32 +157,52 @@ def extract_tables_from_pdf(pdf_path: str) -> List[Dict]: return tables_data +def parse_page_ocr(page: fitz.Page) -> str: + pix = page.get_pixmap(dpi=300) + img = Image.open(io.BytesIO(pix.tobytes("png"))) + img_array = np.array(img) + img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY) + img_array = cv2.fastNlMeansDenoising(img_array, h=10, templateWindowSize=7, searchWindowSize=21) + page_text = pytesseract.image_to_string(img_array) + return page_text + + +def parse_page_embedded(page: fitz.Page) -> str: + # Extract text from the page using PyMuPDF + page_text = page.get_text("text") + + # Clean out null bytes or UTF-16 artifacts + if "\x00" in page_text: + page_text = page_text.replace("\x00", "") + + return page_text + def extract_text_from_pdf(pdf_path: str) -> str: text = [] try: with fitz.open(pdf_path) as doc: for page_num, page in enumerate(doc, start=1): - # Extract text from the page using PyMuPDF - page_text = page.get_text("text") - - # Clean out null bytes or UTF-16 artifacts - if "\x00" in page_text: - page_text = page_text.replace("\x00", "") - - # If the page is mostly empty, treat as image and use OCR - if not page_text.strip(): - pix = page.get_pixmap(dpi=300) - img = Image.open(io.BytesIO(pix.tobytes("png"))) - img_array = np.array(img) - img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 15) - page_text = pytesseract.image_to_string(img_array) - text.append(page_text) + # try to extract embedded text first + text.append(parse_page_embedded(page)) + text = "\n".join(text) except Exception as e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" - - return "\n".join(text) + + # If there are too many errors in the file + if (check_spelling(text) > MAX_SPELLING_ERROR_RATE): + text = [] + try: + with fitz.open(pdf_path) as doc: + for page_num, page in enumerate(doc, start=1): + # extract page text with OCR + text.append(parse_page_ocr(page)) + text = 
"\n".join(text) + except Exception as e: + print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) + return "" + return text def extract_text_from_pdf_bytes(data: bytes) -> str: @@ -173,20 +211,24 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: try: with fitz.open(stream=data, filetype="pdf") as doc: for page_num, page in enumerate(doc, start=1): - page_text = page.get_text("text") - if "\x00" in page_text: - page_text = page_text.replace("\x00", "") - if not page_text.strip(): - pix = page.get_pixmap(dpi=300) - img = Image.open(io.BytesIO(pix.tobytes("png"))) - img_array = np.array(img) - img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 15) - page_text = pytesseract.image_to_string(img_array) - text.append(page_text) + # try to extract embedded text first + text.append(parse_page_embedded(page)) + text = "\n".join(text) except Exception as e: print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr) return "" - return "\n".join(text) + + if (check_spelling(text) > MAX_SPELLING_ERROR_RATE): + text = [] + try: + with fitz.open(stream=data, filetype="pdf") as doc: + for page_num, page in enumerate(doc, start=1): + text.append(parse_page_ocr(page)) + text = "\n".join(text) + except Exception as e: + print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr) + return "" + return text def save_to_file(text: str, output_path: str): From 0f6ee38c0308161ab2642f9a7dc6796975cba2c6 Mon Sep 17 00:00:00 2001 From: bradley Date: Sun, 15 Feb 2026 19:13:19 -0800 Subject: [PATCH 3/4] Added docstrings to functions --- src/preprocessing/pdf_text_extraction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index 9222bb7..89e3209 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -158,6 +158,7 @@ def extract_tables_from_pdf(pdf_path: str) -> List[Dict]: 
return tables_data def parse_page_ocr(page: fitz.Page) -> str: + """Extracts text from page using OCR""" pix = page.get_pixmap(dpi=300) img = Image.open(io.BytesIO(pix.tobytes("png"))) img_array = np.array(img) @@ -168,6 +169,7 @@ def parse_page_ocr(page: fitz.Page) -> str: def parse_page_embedded(page: fitz.Page) -> str: + """Extracts text embedded in a PDF page""" # Extract text from the page using PyMuPDF page_text = page.get_text("text") From 5bf631ccdbc8f773de8aa84143e7bae9fb45f524 Mon Sep 17 00:00:00 2001 From: bradley Date: Sun, 15 Feb 2026 20:01:04 -0800 Subject: [PATCH 4/4] Fixed formatting errors --- src/preprocessing/pdf_text_extraction.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py index 89e3209..7f7d8ad 100644 --- a/src/preprocessing/pdf_text_extraction.py +++ b/src/preprocessing/pdf_text_extraction.py @@ -27,21 +27,21 @@ fitz.TOOLS.mupdf_display_errors(False) # Maximum allowed ratio of misspelled words to total words in a pdf -MAX_SPELLING_ERROR_RATE = 0.05 +MAX_SPELLING_ERROR_RATE = 0.05 def check_spelling(text: str) -> float: """ - Returns ratio of mispelled words to total words + Returns ratio of misspelled words to total words - Returns 1 if no words detected in input string + Returns 1 if no words detected in input string """ spellChecker = SpellChecker() words = spellChecker.split_words(text) - if (len(words) == 0): + if len(words) == 0: return 1 misspelled = spellChecker.unknown(words) - return (len(misspelled) / len(words)) + return len(misspelled) / len(words) def extract_tables_with_camelot(pdf_path: str) -> List[Dict]: @@ -157,6 +157,7 @@ def extract_tables_from_pdf(pdf_path: str) -> List[Dict]: return tables_data + def parse_page_ocr(page: fitz.Page) -> str: """Extracts text from page using OCR""" pix = page.get_pixmap(dpi=300) @@ -191,9 +192,9 @@ def extract_text_from_pdf(pdf_path: str) -> str: except Exception as 
e: print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr) return "" - + # If there are too many errors in the file - if (check_spelling(text) > MAX_SPELLING_ERROR_RATE): + if check_spelling(text) > MAX_SPELLING_ERROR_RATE: text = [] try: with fitz.open(pdf_path) as doc: @@ -219,8 +220,8 @@ def extract_text_from_pdf_bytes(data: bytes) -> str: except Exception as e: print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr) return "" - - if (check_spelling(text) > MAX_SPELLING_ERROR_RATE): + + if check_spelling(text) > MAX_SPELLING_ERROR_RATE: text = [] try: with fitz.open(stream=data, filetype="pdf") as doc: