17 changes: 17 additions & 0 deletions data/results/Adams_1989_results.json
@@ -0,0 +1,17 @@
{
"source_file": "Adams_1989.txt",
"metrics": {
"species_name": "Gentoo Penguin",
"study_location": "Marion Island",
"study_date": "1984-1985",
"num_empty_stomachs": null,
"num_nonempty_stomachs": null,
"sample_size": 144,
"fraction_feeding": null,
"source_pages": [
1,
2,
6
]
}
}
16 changes: 16 additions & 0 deletions data/results/Fisher_2008_results.json
@@ -0,0 +1,16 @@
{
"source_file": "Fisher_2008.txt",
"metrics": {
"species_name": "Nucella lapillus",
"study_location": "Swans Island, Maine, USA",
"study_date": "2004-2005",
"num_empty_stomachs": null,
"num_nonempty_stomachs": 15,
"sample_size": 225,
"fraction_feeding": 0.0667,
"source_pages": [
1,
2
]
}
}
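The fraction_feeding values in these result files are consistent with num_nonempty_stomachs / sample_size: 15 / 225 ≈ 0.0667 for Fisher_2008, and null for Adams_1989, where the stomach counts are missing. A minimal sketch of that derivation, assuming this is how the pipeline computes the field:

def fraction_feeding(num_nonempty, sample_size):
    # Sketch only (assumed formula): num_nonempty_stomachs / sample_size,
    # rounded to 4 decimals; None whenever either count is missing.
    if num_nonempty is None or not sample_size:
        return None
    return round(num_nonempty / sample_size, 4)

assert fraction_feeding(15, 225) == 0.0667   # Fisher_2008
assert fraction_feeding(None, 144) is None   # Adams_1989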
39 changes: 15 additions & 24 deletions scripts/full_pipeline.py
@@ -79,7 +79,7 @@ def process_api_mode():
out_dir = Path("data/processed-text")
out_dir.mkdir(parents=True, exist_ok=True)
labels: Dict[str, str] = {}
count=1
count = 1
for folder_id, label in [(useful_id, "useful"), (not_useful_id, "not-useful")]:
files = list_pdfs_in_folder(service, folder_id, max_files=None)
print(f"Found {len(files)} PDFs in folder label '{label}'")
@@ -91,7 +91,7 @@ def process_api_mode():
(out_dir / txt_name).write_text(text, encoding="utf-8")
labels[txt_name] = label
print(f"{count} Processed {f['name']}")
count+=1
count += 1

write_labels(labels, Path("data/labels.json"))
print(f"Wrote {len(labels)} labeled text files.")
@@ -101,23 +101,23 @@ def process_local_mode(data_path: Path):
"""Process PDFs from local directory."""
if not data_path.exists():
raise RuntimeError(f"Data path does not exist: {data_path}")

useful_dir = data_path / "useful"
not_useful_dir = data_path / "not-useful"

if not useful_dir.exists():
raise RuntimeError(f"'useful' subfolder not found in {data_path}")
if not not_useful_dir.exists():
raise RuntimeError(f"'not-useful' subfolder not found in {data_path}")

out_dir = Path("data/processed-text")
out_dir.mkdir(parents=True, exist_ok=True)
labels: Dict[str, str] = {}

for folder, label in [(useful_dir, "useful"), (not_useful_dir, "not-useful")]:
pdf_files = list(folder.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDFs in local folder '{label}'")

for pdf_path in pdf_files:
try:
with open(pdf_path, "rb") as f:
@@ -131,7 +131,7 @@ def process_local_mode(data_path: Path):
except Exception as e:
print(f"Error processing {pdf_path.name}: {e}")
continue

write_labels(labels, Path("data/labels.json"))
print(f"Wrote {len(labels)} labeled text files.")

@@ -144,32 +144,23 @@ def main():
Examples:
API mode: python full_pipeline.py --api
Local mode: python full_pipeline.py --local ./data/pdfs
"""
""",
)

# Create mutually exclusive group for --api and --local
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument(
"--api",
action="store_true",
help="Use API mode to download PDFs from Google Drive"
)
group.add_argument(
"--local",
type=Path,
metavar="PATH",
help="Use local mode with PDFs from specified directory (should contain 'useful' and 'not-useful' subfolders)"
)

group.add_argument("--api", action="store_true", help="Use API mode to download PDFs from Google Drive")
group.add_argument("--local", type=Path, metavar="PATH", help="Use local mode with PDFs from specified directory (should contain 'useful' and 'not-useful' subfolders)")

args = parser.parse_args()

if args.local:
print(f"Running in LOCAL mode with data path: {args.local}")
process_local_mode(args.local)
else: # args.api
print("Running in API mode (Google Drive)")
process_api_mode()

print("Beginning model training...")
run([sys.executable, "src/model/train_model.py"])
print("Training complete.")
190 changes: 190 additions & 0 deletions src/llm/llm_client.py
@@ -0,0 +1,190 @@
"""LLM-based metric extraction from scientific publications.

Usage:
python llm_client.py path/to/file.pdf
python llm_client.py path/to/file.txt
python llm_client.py path/to/file.pdf --model llama3.1:8b
python llm_client.py path/to/file.txt --output-dir results/

This script uses Ollama to extract structured data from predator diet surveys.
It can read PDFs directly (with automatic OCR for scanned pages) or preprocessed
text files. Extracted data includes species name, study date, location, and
stomach content metrics.
"""

import argparse
import json
import sys
import re
from pathlib import Path

from ollama import chat

from models import PredatorDietMetrics
from llm_text import extract_key_sections, load_document


def extract_metrics_from_text(text: str, model: str = "llama3.1:8b", num_ctx: int = 4096) -> PredatorDietMetrics:
"""Extract structured metrics from text using Ollama.

Args:
text: Preprocessed text content from a scientific publication
model: Name of the Ollama model to use
num_ctx: Context window size to request from Ollama (lower = less memory)

Returns:
PredatorDietMetrics object with extracted data
"""
prompt = f"""You are a scientific data extraction assistant. Your task is to read a predator diet survey publication and return a single flat JSON object with exactly these fields:

species_name - string or null
study_location - string or null
study_date - string or null
num_empty_stomachs - integer (>= 0) or null
num_nonempty_stomachs - integer (>= 0) or null
sample_size - integer (> 0) or null

Use null for any field whose value cannot be confidently determined from the text.

FIELD DEFINITIONS

species_name: Binomial Latin name (Genus species) of the PRIMARY PREDATOR whose diet is studied. This is the animal whose stomachs/guts were examined, not its prey. Return exactly one species. If multiple predators are studied, choose the one with the most stomach samples. Capitalize the genus, lowercase the specific epithet (e.g., "Pygoscelis papua").

study_location: Geographic area where predator specimens were collected. Include site, region, and country if available (e.g., "Marion Island, sub-Antarctic"). Check Methods, Study Area, or Study Site sections.

study_date: Year or year-range of specimen collection, NOT publication year. Format "YYYY" or "YYYY-YYYY". Look for phrases like "specimens collected in", "sampling period", "field season", "between [year] and [year]". Return null if only publication year is visible.

num_empty_stomachs: Number of predators with stomachs containing no food. Synonyms: "empty", "vacant", "without food", "zero prey items", "stomachs with no contents", "N individuals had empty stomachs".

num_nonempty_stomachs: Number of predators with stomachs containing food. Synonyms: "non-empty", "with food", "containing prey", "with contents", "fed", "N contained food", "N had prey items".

sample_size: Total number of predator individuals examined. When both num_empty_stomachs and num_nonempty_stomachs are available, sample_size equals their sum. Look for phrases like "N stomachs were examined", "a total of N individuals", "N specimens", "n=", "sample size of N".

RULES
- Do not invent data; use null if ambiguous or missing.
- Return a single JSON object; do not return arrays.
- Ignore page markers [PAGE N].
- Prioritize Abstract, Methods, and Results sections.
- Be especially careful to distinguish collection dates from publication dates.

EXAMPLES

1. Simple complete case:
{{"species_name": "Pygoscelis papua", "study_location": "Marion Island, sub-Antarctic", "study_date": "1984-1985", "num_empty_stomachs": 5, "num_nonempty_stomachs": 15, "sample_size": 20}}

2. Missing empty stomach data (can infer from sample_size):
{{"species_name": "Canis lupus", "study_location": "Yellowstone National Park, Wyoming, USA", "study_date": "2019", "num_empty_stomachs": null, "num_nonempty_stomachs": 47, "sample_size": 52}}

3. Multi-year study:
{{"species_name": "Vulpes vulpes", "study_location": "Bristol, UK", "study_date": "2015-2018", "num_empty_stomachs": 12, "num_nonempty_stomachs": 88, "sample_size": 100}}

4. Minimal data available:
{{"species_name": "Ursus arctos", "study_location": null, "study_date": "2020", "num_empty_stomachs": null, "num_nonempty_stomachs": null, "sample_size": 23}}

5. Only some fields extractable:
{{"species_name": "Zalophus californianus", "study_location": "California coast", "study_date": null, "num_empty_stomachs": 8, "num_nonempty_stomachs": 34, "sample_size": 42}}

TEXT
{text}
"""
# Ollama call with structured schema output; options passes the requested
# context window through so num_ctx takes effect
response = chat(
messages=[
{
'role': 'user',
'content': prompt,
}
],
model=model,
format=PredatorDietMetrics.model_json_schema(),
options={'num_ctx': num_ctx},
)

metrics = PredatorDietMetrics.model_validate_json(response.message.content)
return metrics
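# Usage sketch (illustrative, not called by the pipeline); assumes a local
# Ollama server with the llama3.1:8b model already pulled:
#   text = Path("data/processed-text/Fisher_2008.txt").read_text(encoding="utf-8")
#   metrics = extract_metrics_from_text(text, num_ctx=4096)
#   print(metrics.model_dump())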


def main():
parser = argparse.ArgumentParser(description="Extract predator diet metrics from PDFs or text files using LLM")
parser.add_argument("input_file", type=str, help="Path to the input file (.pdf or .txt)")
parser.add_argument("--model", type=str, default="llama3.1:8b", help="Ollama model to use (default: llama3.1:8b)")
parser.add_argument("--output-dir", type=str, default="data/results", help="Output directory for JSON results (default: data/results)")
parser.add_argument("--max-chars", type=int, default=12000, help="Maximum characters of text to send to the model (default: 12000). " "Reduce if you hit CUDA/OOM errors.")
parser.add_argument("--num-ctx", type=int, default=4096, help="Context window size for the model (default: 4096). " "Lower values use less memory.")

args = parser.parse_args()

# Validate input file
input_path = Path(args.input_file)
if not input_path.exists():
print(f"[ERROR] File not found: {input_path}", file=sys.stderr)
sys.exit(1)

# Load document (PDF or text)
print(f"Processing {input_path.name}...", file=sys.stderr)
try:
text = load_document(input_path)
except Exception as e:
print(f"[ERROR] Failed to load file: {e}", file=sys.stderr)
sys.exit(1)

# Store original text for page extraction
original_text = text
print(f"[INFO] Text size: {len(text)} chars", file=sys.stderr)

# Extract key sections if text is too long
if len(text) > args.max_chars:
text = extract_key_sections(text, args.max_chars)
print(f"[INFO] Extracted key sections: {len(text)} chars (budget {args.max_chars})", file=sys.stderr)

# Extract metrics using LLM
print(f"[INFO] Extracting metrics with {args.model}...", file=sys.stderr)
try:
metrics = extract_metrics_from_text(text, model=args.model, num_ctx=args.num_ctx)
except Exception as e:
print(f"[ERROR] Extraction failed: {e}", file=sys.stderr)
sys.exit(1)

# Convert to dictionary
metrics_dict = metrics.model_dump()

# Extract page numbers programmatically from where data was found
source_pages: set[int] = set()
_skip_fields = {"fraction_feeding", "source_pages"}
for field_name, value in metrics_dict.items():
if value is not None and field_name not in _skip_fields:
value_str = str(value)
if value_str in original_text:
pos = original_text.find(value_str)
page_markers = re.findall(r'\[PAGE (\d+)\]', original_text[:pos])
if page_markers:
source_pages.add(int(page_markers[-1]))

metrics_dict["source_pages"] = sorted(source_pages) if source_pages else None

# Prepare output
result = {"source_file": input_path.name, "file_type": input_path.suffix.lower(), "metrics": metrics_dict}

# Generate output filename: input_name_results.json
output_filename = input_path.stem + "_results.json"
output_path = Path(args.output_dir) / output_filename

# Save results
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(result, f, indent=2)

print(f"[SUCCESS] Results saved to {output_path}", file=sys.stderr)

# Print summary
print("\n=== Extraction Summary ===", file=sys.stderr)
print(f"Species: {metrics_dict.get('species_name', 'N/A')}", file=sys.stderr)
print(f"Location: {metrics_dict.get('study_location', 'N/A')}", file=sys.stderr)
print(f"Date: {metrics_dict.get('study_date', 'N/A')}", file=sys.stderr)
print(f"Sample size: {metrics_dict.get('sample_size', 'N/A')}", file=sys.stderr)
print(f"Empty stomachs: {metrics_dict.get('num_empty_stomachs', 'N/A')}", file=sys.stderr)
print(f"Non-empty stomachs: {metrics_dict.get('num_nonempty_stomachs', 'N/A')}", file=sys.stderr)
print(f"Fraction feeding: {metrics_dict.get('fraction_feeding', 'N/A')}", file=sys.stderr)


if __name__ == "__main__":
main()