From 33cab3a10d615c5a320b991e849199d7eb1ea120 Mon Sep 17 00:00:00 2001
From: Bradley Rule <bradley.rule@yahoo.com>
Date: Sun, 18 Jan 2026 17:59:16 -0800
Subject: [PATCH 1/4] Added image denoising to OCR

---
 requirements.txt                         |  3 ++-
 src/preprocessing/pdf_text_extraction.py | 11 ++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b940946..92b8f54 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,5 @@ google-auth-oauthlib
 google-auth-httplib2
 numpy
 google-api-python-client
-xgboost
\ No newline at end of file
+xgboost
+opencv-python
\ No newline at end of file
diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index bbd9a6b..48e7c49 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -16,6 +16,8 @@
 import argparse
 from pathlib import Path
 import sys
+import cv2
+import numpy as np
 
 Image.MAX_IMAGE_PIXELS = None
 fitz.TOOLS.mupdf_display_errors(False)
@@ -38,8 +40,9 @@ def extract_text_from_pdf(pdf_path: str) -> str:
                 if not page_text.strip():
                     pix = page.get_pixmap(dpi=300)
                     img = Image.open(io.BytesIO(pix.tobytes("png")))
-                    page_text = pytesseract.image_to_string(img)
-
+                    img_array = np.array(img)
+                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 15)
+                    page_text = pytesseract.image_to_string(img_array)
                 text.append(page_text)
     except Exception as e:
         print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
@@ -60,7 +63,9 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
                 if not page_text.strip():
                     pix = page.get_pixmap(dpi=300)
                     img = Image.open(io.BytesIO(pix.tobytes("png")))
-                    page_text = pytesseract.image_to_string(img)
+                    img_array = np.array(img)
+                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 15)
+                    page_text = pytesseract.image_to_string(img_array)
                 text.append(page_text)
     except Exception as e:
         print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr)

From 1902005f859ac0550725fa9eadc2c0a638f7b886 Mon Sep 17 00:00:00 2001
From: bradley <bradley.rule@yahoo.com>
Date: Sun, 15 Feb 2026 18:40:07 -0800
Subject: [PATCH 2/4] reworked OCR implementation to improve text quality

---
 requirements.txt                         |  3 +-
 src/preprocessing/pdf_text_extraction.py | 98 +++++++++++++++++-------
 2 files changed, 72 insertions(+), 29 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ce0445c..443d114 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,4 +19,5 @@ camelot-py[base]
 opencv-python
 pymupdf_layout
 xgboost
-opencv-python
\ No newline at end of file
+opencv-python
+pyspellchecker
\ No newline at end of file
diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index 21a4312..9222bb7 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -21,10 +21,28 @@
 import camelot
 import cv2
 import numpy as np
+from spellchecker import SpellChecker
 
 Image.MAX_IMAGE_PIXELS = None
 fitz.TOOLS.mupdf_display_errors(False)
 
+# Maximum allowed ratio of misspelled words to total words in a pdf
+MAX_SPELLING_ERROR_RATE = 0.05       
+
+
+def check_spelling(text: str) -> float:
+    """
+        Returns ratio of mispelled words to total words
+
+        Returns 1 if no words detected in input string
+    """
+    spellChecker = SpellChecker()
+    words = spellChecker.split_words(text)
+    if (len(words) == 0):
+        return 1
+    misspelled = spellChecker.unknown(words)
+    return (len(misspelled) / len(words))
+
 
 def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
     """Extract tables using camelot-py (fallback method).
@@ -139,32 +157,52 @@ def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
 
     return tables_data
 
+def parse_page_ocr(page: fitz.Page) -> str:
+    pix = page.get_pixmap(dpi=300)
+    img = Image.open(io.BytesIO(pix.tobytes("png")))
+    img_array = np.array(img)
+    img_array = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
+    img_array = cv2.fastNlMeansDenoising(img_array, h=10, templateWindowSize=7, searchWindowSize=21)
+    page_text = pytesseract.image_to_string(img_array)
+    return page_text
+
+
+def parse_page_embedded(page: fitz.Page) -> str:
+    # Extract text from the page using PyMuPDF
+    page_text = page.get_text("text")
+
+    # Clean out null bytes or UTF-16 artifacts
+    if "\x00" in page_text:
+        page_text = page_text.replace("\x00", "")
+
+    return page_text
+
 
 def extract_text_from_pdf(pdf_path: str) -> str:
     text = []
     try:
         with fitz.open(pdf_path) as doc:
             for page_num, page in enumerate(doc, start=1):
-                # Extract text from the page using PyMuPDF
-                page_text = page.get_text("text")
-
-                # Clean out null bytes or UTF-16 artifacts
-                if "\x00" in page_text:
-                    page_text = page_text.replace("\x00", "")
-
-                # If the page is mostly empty, treat as image and use OCR
-                if not page_text.strip():
-                    pix = page.get_pixmap(dpi=300)
-                    img = Image.open(io.BytesIO(pix.tobytes("png")))
-                    img_array = np.array(img)
-                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 15)
-                    page_text = pytesseract.image_to_string(img_array)
-                text.append(page_text)
+                # try to extract embedded text first
+                text.append(parse_page_embedded(page))
+            text = "\n".join(text)
     except Exception as e:
         print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
         return ""
-
-    return "\n".join(text)
+    
+    # If there are too many errors in the file
+    if (check_spelling(text) > MAX_SPELLING_ERROR_RATE):
+        text = []
+        try:
+            with fitz.open(pdf_path) as doc:
+                for page_num, page in enumerate(doc, start=1):
+                    # extract page text with OCR
+                    text.append(parse_page_ocr(page))
+                text = "\n".join(text)
+        except Exception as e:
+            print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
+            return ""
+    return text
 
 
 def extract_text_from_pdf_bytes(data: bytes) -> str:
@@ -173,20 +211,24 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
     try:
         with fitz.open(stream=data, filetype="pdf") as doc:
             for page_num, page in enumerate(doc, start=1):
-                page_text = page.get_text("text")
-                if "\x00" in page_text:
-                    page_text = page_text.replace("\x00", "")
-                if not page_text.strip():
-                    pix = page.get_pixmap(dpi=300)
-                    img = Image.open(io.BytesIO(pix.tobytes("png")))
-                    img_array = np.array(img)
-                    img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 15)
-                    page_text = pytesseract.image_to_string(img_array)
-                text.append(page_text)
+                # try to extract embedded text first
+                text.append(parse_page_embedded(page))
+            text = "\n".join(text)
     except Exception as e:
         print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr)
         return ""
-    return "\n".join(text)
+    
+    if (check_spelling(text) > MAX_SPELLING_ERROR_RATE):
+        text = []
+        try:
+            with fitz.open(stream=data, filetype="pdf") as doc:
+                for page_num, page in enumerate(doc, start=1):
+                    text.append(parse_page_ocr(page))
+                text = "\n".join(text)
+        except Exception as e:
+            print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr)
+            return ""
+    return text
 
 
 def save_to_file(text: str, output_path: str):

From 0f6ee38c0308161ab2642f9a7dc6796975cba2c6 Mon Sep 17 00:00:00 2001
From: bradley <bradley.rule@yahoo.com>
Date: Sun, 15 Feb 2026 19:13:19 -0800
Subject: [PATCH 3/4] Added docstrings to functions

---
 src/preprocessing/pdf_text_extraction.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index 9222bb7..89e3209 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -158,6 +158,7 @@ def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
     return tables_data
 
 def parse_page_ocr(page: fitz.Page) -> str:
+    """Extracts text from page using OCR"""
     pix = page.get_pixmap(dpi=300)
     img = Image.open(io.BytesIO(pix.tobytes("png")))
     img_array = np.array(img)
@@ -168,6 +169,7 @@ def parse_page_ocr(page: fitz.Page) -> str:
 
 
 def parse_page_embedded(page: fitz.Page) -> str:
+    """Extracts text embedded in a PDF page"""
     # Extract text from the page using PyMuPDF
     page_text = page.get_text("text")
 

From 5bf631ccdbc8f773de8aa84143e7bae9fb45f524 Mon Sep 17 00:00:00 2001
From: bradley <bradley.rule@yahoo.com>
Date: Sun, 15 Feb 2026 20:01:04 -0800
Subject: [PATCH 4/4] Fixed formatting errors

---
 src/preprocessing/pdf_text_extraction.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index 89e3209..7f7d8ad 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -27,21 +27,21 @@
 fitz.TOOLS.mupdf_display_errors(False)
 
 # Maximum allowed ratio of misspelled words to total words in a pdf
-MAX_SPELLING_ERROR_RATE = 0.05       
+MAX_SPELLING_ERROR_RATE = 0.05
 
 
 def check_spelling(text: str) -> float:
     """
-        Returns ratio of mispelled words to total words
+    Returns ratio of misspelled words to total words
 
-        Returns 1 if no words detected in input string
+    Returns 1 if no words detected in input string
     """
     spellChecker = SpellChecker()
     words = spellChecker.split_words(text)
-    if (len(words) == 0):
+    if len(words) == 0:
         return 1
     misspelled = spellChecker.unknown(words)
-    return (len(misspelled) / len(words))
+    return len(misspelled) / len(words)
 
 
 def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
@@ -157,6 +157,7 @@ def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
 
     return tables_data
 
+
 def parse_page_ocr(page: fitz.Page) -> str:
     """Extracts text from page using OCR"""
     pix = page.get_pixmap(dpi=300)
@@ -191,9 +192,9 @@ def extract_text_from_pdf(pdf_path: str) -> str:
     except Exception as e:
         print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
         return ""
-    
+
     # If there are too many errors in the file
-    if (check_spelling(text) > MAX_SPELLING_ERROR_RATE):
+    if check_spelling(text) > MAX_SPELLING_ERROR_RATE:
         text = []
         try:
             with fitz.open(pdf_path) as doc:
@@ -219,8 +220,8 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
     except Exception as e:
         print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr)
         return ""
-    
-    if (check_spelling(text) > MAX_SPELLING_ERROR_RATE):
+
+    if check_spelling(text) > MAX_SPELLING_ERROR_RATE:
         text = []
         try:
             with fitz.open(stream=data, filetype="pdf") as doc: