diff --git a/data/results/Adams_1989_results.json b/data/results/Adams_1989_results.json
new file mode 100644
index 0000000..70d465e
--- /dev/null
+++ b/data/results/Adams_1989_results.json
@@ -0,0 +1,17 @@
+{
+  "source_file": "Adams_1989.txt",
+  "metrics": {
+    "species_name": "Gentoo Penguin",
+    "study_location": "Marion Island",
+    "study_date": "1984-1985",
+    "num_empty_stomachs": null,
+    "num_nonempty_stomachs": null,
+    "sample_size": 144,
+    "fraction_feeding": null,
+    "source_pages": [
+      1,
+      2,
+      6
+    ]
+  }
+}
\ No newline at end of file
diff --git a/data/results/Fisher_2008_results.json b/data/results/Fisher_2008_results.json
new file mode 100644
index 0000000..f6bc255
--- /dev/null
+++ b/data/results/Fisher_2008_results.json
@@ -0,0 +1,16 @@
+{
+  "source_file": "Fisher_2008.txt",
+  "metrics": {
+    "species_name": "Nucella lapillus",
+    "study_location": "Swans Island, Maine, USA",
+    "study_date": "2004-2005",
+    "num_empty_stomachs": null,
+    "num_nonempty_stomachs": 15,
+    "sample_size": 225,
+    "fraction_feeding": 0.0667,
+    "source_pages": [
+      1,
+      2
+    ]
+  }
+}
\ No newline at end of file
diff --git a/scripts/full_pipeline.py b/scripts/full_pipeline.py
index 95c2de2..f7ff28f 100644
--- a/scripts/full_pipeline.py
+++ b/scripts/full_pipeline.py
@@ -79,7 +79,7 @@ def process_api_mode():
     out_dir = Path("data/processed-text")
     out_dir.mkdir(parents=True, exist_ok=True)
     labels: Dict[str, str] = {}
-    count=1
+    count = 1
     for folder_id, label in [(useful_id, "useful"), (not_useful_id, "not-useful")]:
         files = list_pdfs_in_folder(service, folder_id, max_files=None)
         print(f"Found {len(files)} PDFs in folder label '{label}'")
@@ -91,7 +91,7 @@ def process_api_mode():
             (out_dir / txt_name).write_text(text, encoding="utf-8")
             labels[txt_name] = label
             print(f"{count} Processed {f['name']}")
-            count+=1
+            count += 1
 
     write_labels(labels, Path("data/labels.json"))
     print(f"Wrote {len(labels)} labeled text files.")
@@ -101,23 +101,23 @@ def process_local_mode(data_path: Path):
     """Process PDFs from local directory."""
     if not data_path.exists():
         raise RuntimeError(f"Data path does not exist: {data_path}")
-    
+
     useful_dir = data_path / "useful"
     not_useful_dir = data_path / "not-useful"
-    
+
     if not useful_dir.exists():
         raise RuntimeError(f"'useful' subfolder not found in {data_path}")
     if not not_useful_dir.exists():
         raise RuntimeError(f"'not-useful' subfolder not found in {data_path}")
-    
+
     out_dir = Path("data/processed-text")
     out_dir.mkdir(parents=True, exist_ok=True)
     labels: Dict[str, str] = {}
-    
+
     for folder, label in [(useful_dir, "useful"), (not_useful_dir, "not-useful")]:
         pdf_files = list(folder.glob("*.pdf"))
         print(f"Found {len(pdf_files)} PDFs in local folder '{label}'")
-    
+
         for pdf_path in pdf_files:
             try:
                 with open(pdf_path, "rb") as f:
@@ -131,7 +131,7 @@ def process_local_mode(data_path: Path):
             except Exception as e:
                 print(f"Error processing {pdf_path.name}: {e}")
                 continue
-    
+
     write_labels(labels, Path("data/labels.json"))
     print(f"Wrote {len(labels)} labeled text files.")
 
@@ -144,32 +144,23 @@ def main():
     Examples:
       API mode:   python full_pipeline.py --api
      Local mode: python full_pipeline.py --local ./data/pdfs
-    """
+    """,
     )
-    
+
     # Create mutually exclusive group for --api and --local
     group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument(
-        "--api",
-        action="store_true",
-        help="Use API mode to download PDFs from Google Drive"
-    )
-    group.add_argument(
-        "--local",
-        type=Path,
-        metavar="PATH",
-        help="Use local mode with PDFs from specified directory (should contain 'useful' and 'not-useful' subfolders)"
-    )
-    
+    group.add_argument("--api", action="store_true", help="Use API mode to download PDFs from Google Drive")
+    group.add_argument("--local", type=Path, metavar="PATH", help="Use local mode with PDFs from specified directory (should contain 'useful' and 'not-useful' subfolders)")
+
     args = parser.parse_args()
-    
+
     if args.local:
         print(f"Running in LOCAL mode with data path: {args.local}")
         process_local_mode(args.local)
     else:  # args.api
         print("Running in API mode (Google Drive)")
         process_api_mode()
-    
+
     print("Beginning model training...")
     run([sys.executable, "src/model/train_model.py"])
     print("Training complete.")
diff --git a/src/llm/llm_client.py b/src/llm/llm_client.py
new file mode 100644
index 0000000..e55ace2
--- /dev/null
+++ b/src/llm/llm_client.py
@@ -0,0 +1,191 @@
+"""LLM-based metric extraction from scientific publications.
+
+Usage:
+    python llm_client.py path/to/file.pdf
+    python llm_client.py path/to/file.txt
+    python llm_client.py path/to/file.pdf --model llama3.1:8b
+    python llm_client.py path/to/file.txt --output-dir results/
+
+This script uses Ollama to extract structured data from predator diet surveys.
+It can read PDFs directly (with automatic OCR for scanned pages) or preprocessed
+text files. Extracted data includes species name, study date, location, and
+stomach content metrics.
+"""
+
+import argparse
+import json
+import sys
+import re
+from pathlib import Path
+
+from ollama import chat
+
+from models import PredatorDietMetrics
+from llm_text import extract_key_sections, load_document
+
+
+def extract_metrics_from_text(text: str, model: str = "llama3.1:8b", num_ctx: int = 4096) -> PredatorDietMetrics:
+    """Extract structured metrics from text using Ollama.
+
+    Args:
+        text: Preprocessed text content from a scientific publication
+        model: Name of the Ollama model to use
+        num_ctx: Context window size to request from Ollama (lower = less memory)
+
+    Returns:
+        PredatorDietMetrics object with extracted data
+    """
+    prompt = f"""You are a scientific data extraction assistant. Your task is to read a predator diet survey publication and return a single flat JSON object with exactly these fields:
+
+  species_name - string or null
+  study_location - string or null
+  study_date - string or null
+  num_empty_stomachs - integer (>= 0) or null
+  num_nonempty_stomachs - integer (>= 0) or null
+  sample_size - integer (> 0) or null
+
+Use null for any field whose value cannot be confidently determined from the text.
+
+FIELD DEFINITIONS
+
+species_name: Binomial Latin name (Genus species) of the PRIMARY PREDATOR whose diet is studied. This is the animal whose stomachs/guts were examined, not its prey. Return exactly one species. If multiple predators are studied, choose the one with the most stomach samples. Capitalize the genus, lowercase the specific epithet (e.g., "Pygoscelis papua").
+
+study_location: Geographic area where predator specimens were collected. Include site, region, and country if available (e.g., "Marion Island, sub-Antarctic"). Check Methods, Study Area, or Study Site sections.
+
+study_date: Year or year-range of specimen collection, NOT publication year. Format "YYYY" or "YYYY-YYYY". Look for phrases like "specimens collected in", "sampling period", "field season", "between [year] and [year]". Return null if only publication year is visible.
+
+num_empty_stomachs: Number of predators with stomachs containing no food. Synonyms: "empty", "vacant", "without food", "zero prey items", "stomachs with no contents", "N individuals had empty stomachs".
+
+num_nonempty_stomachs: Number of predators with stomachs containing food. Synonyms: "non-empty", "with food", "containing prey", "with contents", "fed", "N contained food", "N had prey items".
+
+sample_size: Total number of predator individuals examined. When both num_empty_stomachs and num_nonempty_stomachs are available, sample_size equals their sum. Look for phrases like "N stomachs were examined", "a total of N individuals", "N specimens", "n=", "sample size of N".
+
+RULES
+- Do not invent data; use null if ambiguous or missing.
+- Return a single JSON object; do not return arrays.
+- Ignore page markers [PAGE N].
+- Prioritize Abstract, Methods, and Results sections.
+- Be especially careful to distinguish collection dates from publication dates.
+
+EXAMPLES
+
+1. Simple complete case:
+{{"species_name": "Pygoscelis papua", "study_location": "Marion Island, sub-Antarctic", "study_date": "1984-1985", "num_empty_stomachs": 5, "num_nonempty_stomachs": 15, "sample_size": 20}}
+
+2. Missing empty stomach data (can infer from sample_size):
+{{"species_name": "Canis lupus", "study_location": "Yellowstone National Park, Wyoming, USA", "study_date": "2019", "num_empty_stomachs": null, "num_nonempty_stomachs": 47, "sample_size": 52}}
+
+3. Multi-year study:
+{{"species_name": "Vulpes vulpes", "study_location": "Bristol, UK", "study_date": "2015-2018", "num_empty_stomachs": 12, "num_nonempty_stomachs": 88, "sample_size": 100}}
+
+4. Minimal data available:
+{{"species_name": "Ursus arctos", "study_location": null, "study_date": "2020", "num_empty_stomachs": null, "num_nonempty_stomachs": null, "sample_size": 23}}
+
+5. Only some fields extractable:
+{{"species_name": "Zalophus californianus", "study_location": "California coast", "study_date": null, "num_empty_stomachs": 8, "num_nonempty_stomachs": 34, "sample_size": 42}}
+
+TEXT
+{text}
+"""
+    # Ollama call with structured schema output
+    response = chat(
+        messages=[
+            {
+                'role': 'user',
+                'content': prompt,
+            }
+        ],
+        model=model,
+        format=PredatorDietMetrics.model_json_schema(),
+        options={"num_ctx": num_ctx},  # pass the requested context window through to Ollama
+    )
+
+    metrics = PredatorDietMetrics.model_validate_json(response.message.content)
+    return metrics
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Extract predator diet metrics from PDFs or text files using LLM")
+    parser.add_argument("input_file", type=str, help="Path to the input file (.pdf or .txt)")
+    parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)")
+    parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results)")
+    parser.add_argument("--max-chars", type=int, default=12000, help="Maximum characters of text to send to the model (default: 12000). " "Reduce if you hit CUDA/OOM errors.")
+    parser.add_argument("--num-ctx", type=int, default=4096, help="Context window size for the model (default: 4096). " "Lower values use less memory.")
+
+    args = parser.parse_args()
+
+    # Validate input file
+    input_path = Path(args.input_file)
+    if not input_path.exists():
+        print(f"[ERROR] File not found: {input_path}", file=sys.stderr)
+        sys.exit(1)
+
+    # Load document (PDF or text)
+    print(f"Processing {input_path.name}...", file=sys.stderr)
+    try:
+        text = load_document(input_path)
+    except Exception as e:
+        print(f"[ERROR] Failed to load file: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Store original text for page extraction
+    original_text = text
+    print(f"[INFO] Text size: {len(text)} chars", file=sys.stderr)
+
+    # Extract key sections if text is too long
+    if len(text) > args.max_chars:
+        text = extract_key_sections(text, args.max_chars)
+        print(f"[INFO] Extracted key sections: {len(text)} chars (budget {args.max_chars})", file=sys.stderr)
+
+    # Extract metrics using LLM
+    print(f"[INFO] Extracting metrics with {args.model}...", file=sys.stderr)
+    try:
+        metrics = extract_metrics_from_text(text, model=args.model, num_ctx=args.num_ctx)
+    except Exception as e:
+        print(f"[ERROR] Extraction failed: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    # Convert to dictionary
+    metrics_dict = metrics.model_dump()
+
+    # Extract page numbers programmatically from where data was found
+    source_pages: set[int] = set()
+    _skip_fields = {"fraction_feeding", "source_pages"}
+    for field_name, value in metrics_dict.items():
+        if value is not None and field_name not in _skip_fields:
+            value_str = str(value)
+            if value_str in original_text:
+                pos = original_text.find(value_str)
+                page_markers = re.findall(r'\[PAGE (\d+)\]', original_text[:pos])
+                if page_markers:
+                    source_pages.add(int(page_markers[-1]))
+
+    metrics_dict["source_pages"] = sorted(source_pages) if source_pages else None
+
+    # Prepare output
+    result = {"source_file": input_path.name, "file_type": input_path.suffix.lower(), "metrics": metrics_dict}
+
+    # Generate output filename: input_name_results.json
+    output_filename = input_path.stem + "_results.json"
+    output_path = Path(args.output_dir) / output_filename
+
+    # Save results
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, indent=2)
+
+    print(f"[SUCCESS] Results saved to {output_path}", file=sys.stderr)
+
+    # Print summary
+    print("\n=== Extraction Summary ===", file=sys.stderr)
+    print(f"Species: {metrics_dict.get('species_name', 'N/A')}", file=sys.stderr)
+    print(f"Location: {metrics_dict.get('study_location', 'N/A')}", file=sys.stderr)
+    print(f"Date: {metrics_dict.get('study_date', 'N/A')}", file=sys.stderr)
+    print(f"Sample size: {metrics_dict.get('sample_size', 'N/A')}", file=sys.stderr)
+    print(f"Empty stomachs: {metrics_dict.get('num_empty_stomachs', 'N/A')}", file=sys.stderr)
+    print(f"Non-empty stomachs: {metrics_dict.get('num_nonempty_stomachs', 'N/A')}", file=sys.stderr)
+    print(f"Fraction feeding: {metrics_dict.get('fraction_feeding', 'N/A')}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/llm/llm_text.py b/src/llm/llm_text.py
new file mode 100644
index 0000000..d311abd
--- /dev/null
+++ b/src/llm/llm_text.py
@@ -0,0 +1,155 @@
+"""Text preprocessing and section extraction utilities.
+
+This module handles intelligent extraction of key sections from scientific papers,
+prioritizing the most informative content when text must be truncated to fit
+within LLM context windows.
+"""
+
+import re
+import sys
+from pathlib import Path
+from typing import List, Tuple
+
+# Add project root to path
+project_root = Path(__file__).parent.parent.parent
+sys.path.insert(0, str(project_root))
+
+from src.preprocessing.pdf_text_extraction import extract_text_from_pdf
+
+# Section headers commonly found in scientific diet / stomach-content papers.
+# Order matters: earlier entries are higher priority when budget is tight.
+SECTION_PATTERNS: List[re.Pattern[str]] = [
+    re.compile(r"(?i)^\s*(?:abstract|summary)"),
+    re.compile(r"(?i)^\s*(?:results?)\b"),
+    re.compile(r"(?i)^\s*(?:methods?|materials?\s*(?:and|&)\s*methods?|study\s*(?:area|site))"),
+    re.compile(r"(?i)^\s*(?:table)\s*\d"),
+    re.compile(r"(?i)^\s*(?:introduction|background)"),
+    re.compile(r"(?i)^\s*(?:discussion)"),
+]
+
+# Sections that are almost never useful for metric extraction.
+SKIP_PATTERNS: List[re.Pattern[str]] = [
+    re.compile(r"(?i)^\s*(?:acknowledge|literature\s*cited|references|bibliography|appendix|supplementary)"),
+]
+
+
+def split_into_pages(text: str) -> List[Tuple[int, str]]:
+    """Split text on ``[PAGE N]`` markers.
+
+    Args:
+        text: Input text with [PAGE N] markers
+
+    Returns:
+        List of (page_number, page_text) tuples
+    """
+    parts = re.split(r"\[PAGE\s+(\d+)\]", text)
+    # parts: [before_first_marker, page_num, page_text, page_num, page_text, ...]
+    pages: List[Tuple[int, str]] = []
+    if parts[0].strip():
+        pages.append((0, parts[0]))
+    for i in range(1, len(parts), 2):
+        page_num = int(parts[i])
+        page_text = parts[i + 1] if i + 1 < len(parts) else ""
+        pages.append((page_num, page_text))
+    return pages
+
+
+def classify_page(page_text: str) -> Tuple[bool, int]:
+    """Determine if a page is useful and assign a priority score.
+
+    Args:
+        page_text: Text content of the page
+
+    Returns:
+        Tuple of (is_useful, priority) where lower priority number means higher importance.
+        Pages matching skip patterns return (False, 999).
+        Pages with no recognized header get default mid-priority.
+    """
+    for pat in SKIP_PATTERNS:
+        if pat.search(page_text):
+            return False, 999
+    for idx, pat in enumerate(SECTION_PATTERNS):
+        if pat.search(page_text):
+            return True, idx
+    # No recognised header — still potentially useful (e.g. tables without
+    # a "Table" header, continuation of Results, etc.)
+    return True, len(SECTION_PATTERNS)
+
+
+def extract_key_sections(text: str, max_chars: int) -> str:
+    """Return the most informative portion of text within the character budget.
+
+    Strategy:
+    1. Split the paper into pages using [PAGE N] markers
+    2. Drop pages belonging to References/Acknowledgements/Appendix
+    3. Rank remaining pages by section priority:
+       Abstract > Results > Methods > Tables > Introduction > Discussion > other
+    4. Greedily pack pages in priority order until the budget is spent
+    5. Re-order selected pages by their original page number so the LLM
+       sees them in reading order
+
+    Args:
+        text: Full text of the document
+        max_chars: Maximum character budget for the output
+
+    Returns:
+        Extracted text containing the most relevant sections within the budget.
+        If the full text fits within max_chars, it is returned as-is.
+    """
+    if len(text) <= max_chars:
+        return text
+
+    pages = split_into_pages(text)
+    scored: List[Tuple[int, int, str]] = []  # (priority, page_num, page_text)
+    for page_num, page_text in pages:
+        useful, priority = classify_page(page_text)
+        if useful:
+            scored.append((priority, page_num, page_text))
+
+    # Sort by priority (ascending = most important first)
+    scored.sort(key=lambda t: t[0])
+
+    selected: List[Tuple[int, str]] = []
+    budget = max_chars
+    for _priority, page_num, page_text in scored:
+        page_with_marker = f"[PAGE {page_num}]\n{page_text}"
+        if len(page_with_marker) <= budget:
+            selected.append((page_num, page_with_marker))
+            budget -= len(page_with_marker)
+        elif budget > 200:
+            # Partially include the page up to the remaining budget
+            selected.append((page_num, page_with_marker[:budget]))
+            budget = 0
+            break
+
+    # Re-sort by page number so the LLM sees content in reading order
+    selected.sort(key=lambda t: t[0])
+    return "\n".join(chunk for _, chunk in selected)
+
+
+def load_document(file_path: Path) -> str:
+    """Load document from PDF or text file.
+
+    Args:
+        file_path: Path to the input file (.pdf or .txt)
+
+    Returns:
+        Extracted text content with [PAGE N] markers
+
+    Raises:
+        RuntimeError: If file reading fails
+    """
+    suffix = file_path.suffix.lower()
+
+    if suffix == '.pdf':
+        print(f"[INFO] Reading PDF file...", file=sys.stderr)
+        return extract_text_from_pdf(str(file_path))
+    elif suffix in ['.txt', '.text']:
+        print(f"[INFO] Reading text file...", file=sys.stderr)
+        try:
+            with open(file_path, "r", encoding="utf-8") as f:
+                return f.read()
+        except UnicodeDecodeError as e:
+            raise RuntimeError(f"Text file encoding error: {e}")
+    else:
+        raise RuntimeError(f"Unsupported file type: {suffix}. Use .pdf or .txt files.")
diff --git a/src/llm/local_llm.py b/src/llm/local_llm.py
deleted file mode 100644
index 981ab46..0000000
--- a/src/llm/local_llm.py
+++ /dev/null
@@ -1,167 +0,0 @@
-"""LLM-based metric extraction from preprocessed text files.
-
-Usage:
-    python extract_metrics.py path/to/text_file.txt
-    python extract_metrics.py path/to/text_file.txt --model llama3.1:8b
-    python extract_metrics.py path/to/text_file.txt --output-dir results/
-
-This script uses Ollama to extract structured data from preprocessed predator diet
-surveys, including species name, study date, location, and stomach content data.
-"""
-
-import argparse
-import json
-import sys
-from pathlib import Path
-from typing import Optional
-
-from ollama import chat
-from pydantic import BaseModel, Field
-
-
-class PredatorDietMetrics(BaseModel):
-    """Structured schema for extracted predator diet survey metrics."""
-
-    species_name: Optional[str] = Field(None, description="Scientific name of the predator species studied")
-    study_location: Optional[str] = Field(None, description="Geographic location where the study was conducted")
-    study_date: Optional[str] = Field(None, description="Year or date range when the study was conducted")
-    num_empty_stomachs: Optional[int] = Field(None, description="Number of predators with empty stomachs")
-    num_nonempty_stomachs: Optional[int] = Field(None, description="Number of predators with non-empty stomachs")
-    sample_size: Optional[int] = Field(None, description="Total number of predators surveyed")
-
-
-def extract_metrics_from_text(text: str, model: str = "llama3.1:8b") -> PredatorDietMetrics:
-    """Extract structured metrics from text using Ollama.
-
-    Args:
-        text: Preprocessed text content from a scientific publication
-        model: Name of the Ollama model to use
-
-    Returns:
-        PredatorDietMetrics object with extracted data
-    """
-    prompt = f"""You are a scientific data extraction assistant specializing in predator diet surveys.
-
-Extract specific metrics from the text below. Focus on stomach content data where:
-- EMPTY stomachs = no food/prey
-- NON-EMPTY stomachs = contained food/prey
-- SAMPLE SIZE = total number of predators examined
-
-KEY INFORMATION TO FIND:
-- Species names are in Latin format (Genus species)
-- Look in tables, methods, and results sections
-- Empty stomachs: "empty", "vacant", "no prey"
-- Non-empty stomachs: "with prey", "fed", "containing food"
-
-EXTRACT:
-- species_name: Scientific name of PRIMARY predator studied (not prey)
-- study_location: Geographic location of sampling
-- study_date: Year or date range of collection
-- num_empty_stomachs: Number with empty stomachs
-- num_nonempty_stomachs: Number with food in stomachs
-- sample_size: Total number examined
-
-
-TEXT:
-{text}
-"""
-    # Ollama call with structured schema output
-    response = chat(
-        messages=[
-            {
-                'role': 'user',
-                'content': prompt,
-            }
-        ],
-        model=model,
-        format=PredatorDietMetrics.model_json_schema(),
-    )
-
-    metrics = PredatorDietMetrics.model_validate_json(response.message.content)
-    return metrics
-
-
-def validate_and_calculate(metrics: dict) -> dict:
-    """Validate extracted metrics and calculate derived values.
-
-    Args:
-        metrics: Dictionary of extracted metrics
-
-    Returns:
-        Dictionary with validated metrics and calculated fraction_feeding
-    """
-    empty = metrics.get("num_empty_stomachs")
-    nonempty = metrics.get("num_nonempty_stomachs")
-    sample = metrics.get("sample_size")
-
-    # Validate and fix sample size if needed
-    if empty is not None and nonempty is not None:
-        calculated_sample = empty + nonempty
-        if sample is None:
-            metrics["sample_size"] = calculated_sample
-            sample = calculated_sample
-        elif sample != calculated_sample:
-            # LLM made an error, use calculated value
-            metrics["sample_size"] = calculated_sample
-            sample = calculated_sample
-
-    # Calculate fraction of feeding predators
-    fraction_feeding = None
-    if nonempty is not None and sample is not None and sample > 0:
-        fraction_feeding = round(nonempty / sample, 4)
-
-    metrics["fraction_feeding"] = fraction_feeding
-
-    return metrics
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Extract predator diet metrics from preprocessed text using LLM")
-    parser.add_argument("text_file", type=str, help="Path to the preprocessed text file")
-    parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)")
-    parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results)")
-
-    args = parser.parse_args()
-
-    # Load text file
-    text_path = Path(args.text_file)
-    if not text_path.exists():
-        print(f"[ERROR] File not found: {text_path}", file=sys.stderr)
-        sys.exit(1)
-
-    try:
-        with open(text_path, "r", encoding="utf-8") as f:
-            text = f.read()
-    except Exception as e:
-        print(f"[ERROR] Failed to read file: {e}", file=sys.stderr)
-        sys.exit(1)
-
-    # Extract metrics
-    print(f"Extracting metrics from {text_path.name}...", file=sys.stderr)
-    try:
-        metrics = extract_metrics_from_text(text, model=args.model)
-    except Exception as e:
-        print(f"[ERROR] Extraction failed: {e}", file=sys.stderr)
-        sys.exit(1)
-
-    # Validate and calculate derived metrics
-    metrics_dict = metrics.model_dump()
-    metrics_dict = validate_and_calculate(metrics_dict)
-
-    # Prepare output
-    result = {"source_file": text_path.name, "metrics": metrics_dict}
-
-    # Generate output filename: input_name_results.json
-    output_filename = text_path.stem + "_results.json"
-    output_path = Path(args.output_dir) / output_filename
-
-    # Save results
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-    with open(output_path, "w", encoding="utf-8") as f:
-        json.dump(result, f, indent=2)
-
-    print(f"Results saved to {output_path}", file=sys.stderr)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/llm/models.py b/src/llm/models.py
new file mode 100644
index 0000000..9d05a2d
--- /dev/null
+++ b/src/llm/models.py
@@ -0,0 +1,121 @@
+"""Pydantic models for predator diet data extraction."""
+
+from typing import Annotated, Optional
+from pydantic import BaseModel, ConfigDict, Field, NonNegativeInt, computed_field, constr, model_validator
+
+
+class PredatorDietMetrics(BaseModel):
+    """Structured schema for extracted predator diet survey metrics.
+
+    All count fields are non-negative integers. When both
+    ``num_empty_stomachs`` and ``num_nonempty_stomachs`` are present the
+    model guarantees that ``sample_size`` equals their sum.
+    ``fraction_feeding`` is derived automatically from the validated counts.
+    """
+
+    model_config = ConfigDict(
+        strict=True,
+        validate_default=True,
+        str_strip_whitespace=True,
+        frozen=False,
+    )
+
+    species_name: Optional[
+        Annotated[
+            str,
+            constr(min_length=3, max_length=200, pattern=r"^[A-Z][a-z]+(\s[a-z]+)*$"),
+        ]
+    ] = Field(
+        default=None,
+        description=(
+            "Binomial scientific name of the PRIMARY PREDATOR species studied "
+            "(the animal whose stomachs were examined, not its prey). "
+            "Return exactly one species. If multiple predators are studied, "
+            "choose the one with the most stomach samples. "
+            "Format: Capitalize genus, lowercase specific epithet (e.g., 'Canis lupus', 'Pygoscelis papua')."
+        ),
+        examples=["Canis lupus", "Vulpes vulpes", "Pygoscelis papua", "Ursus arctos"],
+    )
+
+    study_location: Optional[Annotated[str, constr(min_length=1, max_length=500)]] = Field(
+        default=None,
+        description=(
+            "Geographic area where predator specimens were collected. "
+            "Include site, region, and country if available. "
+            "Common section locations: Methods, Study Area, Study Site, or Materials and Methods. "
+            "Examples: 'Marion Island, sub-Antarctic', 'Yellowstone National Park, Wyoming, USA', 'Bristol, UK'."
+        ),
+        examples=["Yellowstone National Park, Wyoming, USA", "Marion Island, sub-Antarctic", "Bristol, UK"],
+    )
+
+    study_date: Optional[
+        Annotated[
+            str,
+            constr(
+                min_length=4,
+                max_length=30,
+                pattern=r"^\d{4}([\-—]\d{4})?$",
+            ),
+        ]
+    ] = Field(
+        default=None,
+        description=(
+            "Year or year-range when specimens were COLLECTED (not publication year). "
+            "Format: 'YYYY' for single year or 'YYYY-YYYY' for range (e.g., '2019' or '2019-2021'). "
+            "Common phrasings: 'specimens collected in', 'sampling period', 'field season', "
+            "'between [year] and [year]', 'during [year]', 'from [year] to [year]'. "
+        ),
+        examples=["2019", "2019-2021", "1984-1985", "2015-2018"],
+    )
+
+    num_empty_stomachs: Optional[NonNegativeInt] = Field(
+        default=None,
+        description=(
+            "Number of predators with empty stomachs (no food present). "
+            "Common phrasings: 'empty', 'vacant', 'without food', 'zero prey items', "
+            "'no contents', 'stomachs with no contents', 'N individuals had empty stomachs', "
+            "'N empty', 'N with no prey'. Must be >= 0."
+        ),
+    )
+
+    num_nonempty_stomachs: Optional[NonNegativeInt] = Field(
+        default=None,
+        description=(
+            "Number of predators with non-empty (food-containing) stomachs. "
+            "Common phrasings: 'non-empty', 'with food', 'containing prey', 'with contents', "
+            "'fed', 'N contained food', 'N had prey items', 'N with prey'. Must be >= 0."
+        ),
+    )
+
+    sample_size: Optional[Annotated[int, Field(gt=0)]] = Field(
+        default=None,
+        description=(
+            "Total number of predator individuals whose stomachs (or gut contents) were examined. "
+            "Must be a positive integer (> 0). This is the count of predators dissected, stomach-pumped, "
+            "or otherwise sampled — NOT the number of prey items found. "
+            "Common phrasings: 'N stomachs were examined', 'a total of N individuals', 'N specimens', "
+            "'n=N', 'sample size of N', 'N predators were sampled'. "
+            "When both num_empty_stomachs and num_nonempty_stomachs are reported, "
+            "sample_size should equal their sum."
+        ),
+    )
+
+    @model_validator(mode="after")
+    def _reconcile_sample_size(self) -> "PredatorDietMetrics":
+        """Ensure sample_size == num_empty + num_nonempty when both counts are present."""
+        empty = self.num_empty_stomachs
+        nonempty = self.num_nonempty_stomachs
+        if empty is not None and nonempty is not None:
+            calculated = empty + nonempty
+            if self.sample_size is None or self.sample_size != calculated:
+                self.sample_size = calculated
+        return self
+
+    @computed_field(
+        description="Fraction of predators that had food in their stomachs (0.0–1.0).",
+    )
+    @property
+    def fraction_feeding(self) -> Optional[float]:
+        if self.num_nonempty_stomachs is not None and self.sample_size is not None and self.sample_size > 0:
+            return round(self.num_nonempty_stomachs / self.sample_size, 4)
+        return None
diff --git a/src/preprocessing/pdf_text_extraction.py b/src/preprocessing/pdf_text_extraction.py
index a51ee2c..d3cbf12 100644
--- a/src/preprocessing/pdf_text_extraction.py
+++ b/src/preprocessing/pdf_text_extraction.py
@@ -156,7 +156,8 @@ def extract_text_from_pdf(pdf_path: str) -> str:
                 img = Image.open(io.BytesIO(pix.tobytes("png")))
                 page_text = pytesseract.image_to_string(img)
-                text.append(page_text)
+                # Add page marker for tracking
+                text.append(f"[PAGE {page_num}]\n{page_text}")
     except Exception as e:
         print(f"[ERROR] Failed to extract text from {pdf_path}: {e}", file=sys.stderr)
         return ""
 
@@ -177,7 +178,8 @@ def extract_text_from_pdf_bytes(data: bytes) -> str:
             pix = page.get_pixmap(dpi=300)
             img = Image.open(io.BytesIO(pix.tobytes("png")))
             page_text = pytesseract.image_to_string(img)
-            text.append(page_text)
+            # Add page marker for tracking
+            text.append(f"[PAGE {page_num}]\n{page_text}")
     except Exception as e:
         print(f"[ERROR] Failed to extract text from PDF bytes: {e}", file=sys.stderr)
         return ""