NovakLabOSU · bradleyrule · Jan 19, 2026 · Feb 13, 2026 · Feb 16, 2026 · Feb 16, 2026
diff --git a/requirements.txt b/requirements.txt
@@ -18,4 +18,6 @@ pydantic
 camelot-py[base]
 opencv-python
 pymupdf_layout
-xgboost
+xgboost
+opencv-python
+pyspellchecker
diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
@@ -19,10 +19,30 @@
 import json
 from typing import List, Dict
 import camelot
+import cv2
+import numpy as np
+from spellchecker import SpellChecker
 
 Image.MAX_IMAGE_PIXELS = None
 fitz.TOOLS.mupdf_display_errors(False)
 
+# Maximum allowed ratio of misspelled words to total words in a PDF; above it, OCR is used
+MAX_SPELLING_ERROR_RATE = 0.05
+
+
+def check_spelling(text: str) -> float:
+    """
+    Returns the ratio of misspelled words to total words in `text`.
+
+    Returns 1.0 if no words are detected in the input string.
+    """
+    spell_checker = SpellChecker()
+    words = spell_checker.split_words(text)
+    if not words:
+        return 1.0
+    misspelled = spell_checker.unknown(words)
+    return len(misspelled) / len(words)
+
 
 def extract_tables_with_camelot(pdf_path: str) -> List[Dict]:
     """Extract tables using camelot-py (fallback method).
@@ -138,30 +158,54 @@ def extract_tables_from_pdf(pdf_path: str) -> List[Dict]:
     return tables_data
 
 
+def parse_page_ocr(page: fitz.Page) -> str:
+    """Extracts text from page using OCR (300 dpi render, grayscale, denoise)"""
+    pix = page.get_pixmap(dpi=300)
+    img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
+    img_array = np.array(img)
+    # PIL arrays are in RGB channel order, so use RGB2GRAY (BGR2GRAY swaps the R/B weights)
+    img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+    img_array = cv2.fastNlMeansDenoising(img_array, h=10, templateWindowSize=7, searchWindowSize=21)
+    return pytesseract.image_to_string(img_array)
+
+
+def parse_page_embedded(page: fitz.Page) -> str:
+    """Returns the text layer of a PDF page with null bytes removed"""
+    # Pull the embedded text layer via PyMuPDF's plain-text extraction
+    embedded = page.get_text("text")
+
+    # Drop null bytes / UTF-16 artifacts; replace() leaves the text
+    # untouched when none are present, so no membership check is needed
+    cleaned = embedded.replace("\x00", "")
+
+    return cleaned
+
+
 def extract_text_from_pdf(pdf_path: str) -> str:
     text = []
     try:
         with fitz.open(pdf_path) as doc:
             for page_num, page in enumerate(doc, start=1):
-                # Extract text from the page using PyMuPDF
-                page_text = page.get_text("text")
-
-                # Clean out null bytes or UTF-16 artifacts
-                if "\x00" in page_text:
-                    page_text = page_text.replace("\x00", "")
-
-                # If the page is mostly empty, treat as image and use OCR
-                if not page_text.strip():
-                    pix = page.get_pixmap(dpi=300)
-                    img = Image.open(io.BytesIO(pix.tobytes("png")))
-                    page_text = pytesseract.image_to_string(img)
-
-                text.append(page_text)
+                # First pass: collect the embedded text layer of every page
+                text.append(parse_page_embedded(page))
+            text = "\n".join(text)
     except Exception as e:
         print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
         return ""
 
-    return "\n".join(text)
+    # Too many misspellings (or no words at all) in the embedded text: redo the file with OCR
+    if check_spelling(text) > MAX_SPELLING_ERROR_RATE:
+        text = []  # the embedded text is discarded from here on
+        try:
+            with fitz.open(pdf_path) as doc:
+                for page_num, page in enumerate(doc, start=1):
+                    # Second pass: render each page and run OCR on it
+                    text.append(parse_page_ocr(page))
+                text = "\n".join(text)
+        except Exception as e:
+            print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
+            return ""  # NOTE(review): an OCR failure also drops the embedded text
+    return text
 
 
 def extract_text_from_pdf_bytes(data: bytes) -> str:
@@ -170,18 +214,24 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
     try:
         with fitz.open(stream=data, filetype="pdf") as doc:
             for page_num, page in enumerate(doc, start=1):
-                page_text = page.get_text("text")
-                if "\x00" in page_text:
-                    page_text = page_text.replace("\x00", "")
-                if not page_text.strip():
-                    pix = page.get_pixmap(dpi=300)
-                    img = Image.open(io.BytesIO(pix.tobytes("png")))
-                    page_text = pytesseract.image_to_string(img)
-                text.append(page_text)
+                # try to extract embedded text first
+                text.append(parse_page_embedded(page))
+            text = "\n".join(text)
     except Exception as e:
         print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr)
         return ""
-    return "\n".join(text)
+
+    if check_spelling(text) > MAX_SPELLING_ERROR_RATE:
+        text = []
+        try:
+            with fitz.open(stream=data, filetype="pdf") as doc:
+                for page_num, page in enumerate(doc, start=1):
+                    text.append(parse_page_ocr(page))
+                text = "\n".join(text)
+        except Exception as e:
+            print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr)
+            return ""
+    return text
 
 
 def save_to_file(text: str, output_path: str):