From 00a30989224bca83f419f4e7cf88eac9d4505940 Mon Sep 17 00:00:00 2001 From: Aarja Lohar Date: Fri, 15 Aug 2025 20:57:04 +0530 Subject: [PATCH] Toxicity Detection for Journal & News Analysis --- ml-api/README_TOXICITY.md | 299 +++++++++++++++++++++ ml-api/app/__init__.py | 66 ++++- ml-api/app/routes/perspective_routes.py | 24 ++ ml-api/app/routes/routes.py | 202 +++++++++++--- ml-api/app/services/db_service.py | 18 ++ ml-api/app/services/news_service.py | 68 +++++ ml-api/app/services/perspective_service.py | 56 ++++ ml-api/app/services/toxicity_service.py | 44 +++ ml-api/example_usage.py | 271 +++++++++++++++++++ ml-api/news.py | 15 ++ ml-api/run.py | 21 +- ml-api/setup_and_test.py | 250 +++++++++++++++++ ml-api/start_server.py | 189 +++++++++++++ 13 files changed, 1463 insertions(+), 60 deletions(-) create mode 100644 ml-api/README_TOXICITY.md create mode 100644 ml-api/app/routes/perspective_routes.py create mode 100644 ml-api/app/services/db_service.py create mode 100644 ml-api/app/services/news_service.py create mode 100644 ml-api/app/services/perspective_service.py create mode 100644 ml-api/app/services/toxicity_service.py create mode 100644 ml-api/example_usage.py create mode 100644 ml-api/news.py create mode 100644 ml-api/setup_and_test.py create mode 100644 ml-api/start_server.py diff --git a/ml-api/README_TOXICITY.md b/ml-api/README_TOXICITY.md new file mode 100644 index 00000000..cf152f74 --- /dev/null +++ b/ml-api/README_TOXICITY.md @@ -0,0 +1,299 @@ +# SentiLog ML API - Toxicity Detection + +This document describes the toxicity detection functionality that has been integrated into the SentiLog ML API system. + +## ๐ŸŽฏ Overview + +The system automatically analyzes news articles for toxicity levels using Google's Perspective API, in addition to the existing sentiment analysis. Each news article is scored for toxicity and stored with additional metadata for filtering and analysis. + +## ๐Ÿ—๏ธ Architecture + +### Services + +1. **`toxicity_service.py`** - Core toxicity detection using Perspective API +2. **`perspective_service.py`** - Extended Perspective API wrapper with configurable attributes +3. **`news_service.py`** - Enhanced to include toxicity scoring during news fetching +4. **`db_service.py`** - MongoDB integration for storing toxicity data + +### Data Flow + +``` +News API โ†’ fetch_news() โ†’ Sentiment Analysis (VADER) โ†’ Toxicity Analysis (Perspective) โ†’ MongoDB Storage +``` + +## ๐Ÿ”ง Configuration + +### Required Environment Variables + +```bash +# News API (required) +NEWS_API_KEY=your_news_api_key_here + +# Google Perspective API (required) +PERSPECTIVE_API_KEY=your_perspective_api_key_here + +# Optional Configuration +TOXICITY_THRESHOLD=0.7 # Threshold for marking content as toxic +PERSPECTIVE_TIMEOUT=6 # API request timeout in seconds +MONGO_URI=mongodb://localhost:27017 # MongoDB connection string +MONGO_DB=sentilog # Database name +NEWS_COLLECTION=news # Collection name +``` + +### Getting API Keys + +1. **News API Key**: + - Visit [NewsAPI.org](https://newsapi.org/) + - Sign up for a free account + - Copy your API key + +2. 
**Perspective API Key**: + - Visit [Google Cloud Console](https://console.cloud.google.com/) + - Enable the Perspective Comment Analyzer API + - Create credentials (API key) + - Copy your API key + +## ๐Ÿ“Š Data Structure + +### News Article Schema + +```json +{ + "_id": "ObjectId(...)", + "title": "Article title", + "description": "Article description", + "content": "Full article content", + "url": "https://example.com/article", + "source": "Source name", + "publishedAt": "2025-01-15T10:30:00Z", + "sentiment": "Positive", + "toxicity": { + "score": 0.15, + "toxic": false, + "threshold": 0.7 + }, + "ingested_at": 1705320600 +} +``` + +### Toxicity Object Fields + +- **`score`** (float): Toxicity score from 0.0 (not toxic) to 1.0 (very toxic) +- **`toxic`** (boolean): Whether the content exceeds the toxicity threshold +- **`threshold`** (float): The threshold used for classification + +## ๐Ÿš€ API Endpoints + +### 1. Fetch and Store News with Toxicity Analysis + +```http +POST /ml-api/fetch-and-store +Content-Type: application/json + +{ + "query": "technology", + "page_size": 20, + "max_pages": 2, + "from_date": "2025-01-01" +} +``` + +**Response:** +```json +{ + "status": "done", + "inserted": 35, + "updated": 5, + "errors": [] +} +``` + +### 2. Get News with Toxicity Filtering + +```http +GET /ml-api/news?limit=10&toxic=true +``` + +**Query Parameters:** +- `limit` (int): Number of articles to return (default: 20) +- `toxic` (string): Filter by toxicity level ("true" for toxic, "false" for non-toxic) + +**Response:** +```json +{ + "count": 10, + "results": [ + { + "_id": "...", + "title": "Example Article", + "toxicity": { + "score": 0.85, + "toxic": true, + "threshold": 0.7 + }, + ... + } + ] +} +``` + +### 3. Standalone Toxicity Analysis + +```http +POST /ml-api/analyze-toxicity +Content-Type: application/json + +{ + "text": "Text to analyze for toxicity" +} +``` + +**Response:** +```json +{ + "score": 0.25, + "toxic": false, + "threshold": 0.7 +} +``` + +## ๐Ÿงช Testing + +### Run the Test Suite + +```bash +python setup_and_test.py +``` + +This will test: +- Dependencies installation +- Environment variables +- Database connection +- Toxicity detection services +- News fetching pipeline +- API endpoints (if server is running) + +### Manual Testing + +1. **Start the server:** + ```bash + python run.py + ``` + +2. **Test toxicity analysis:** + ```bash + curl -X POST http://localhost:5001/ml-api/analyze-toxicity \ + -H "Content-Type: application/json" \ + -d '{"text": "This is a test message"}' + ``` + +3. **Fetch and analyze news:** + ```bash + curl -X POST http://localhost:5001/ml-api/fetch-and-store \ + -H "Content-Type: application/json" \ + -d '{"query": "technology", "page_size": 5}' + ``` + +4. **View results:** + ```bash + curl http://localhost:5001/ml-api/news?limit=5 + ``` + +## ๐Ÿ” Understanding Toxicity Scores + +### Score Interpretation + +- **0.0 - 0.2**: Very low toxicity (safe content) +- **0.2 - 0.4**: Low toxicity (mostly safe) +- **0.4 - 0.6**: Moderate toxicity (review recommended) +- **0.6 - 0.8**: High toxicity (likely problematic) +- **0.8 - 1.0**: Very high toxicity (definitely problematic) + +### Default Threshold + +The default threshold is set to **0.7**, meaning content with a score โ‰ฅ 0.7 is marked as toxic. + +## ๐Ÿšจ Error Handling + +The system handles various error scenarios: + +1. **API Rate Limits**: Includes sleep delays between requests +2. **Network Errors**: Graceful fallback with error logging +3. 
**Invalid Content**: Empty or null text returns score 0.0 +4. **API Key Issues**: Clear error messages for missing credentials + +### Error Response Example + +```json +{ + "toxicity": { + "error": "Perspective API request failed: 403 Forbidden" + } +} +``` + +## ๐Ÿ“ˆ Performance Considerations + +1. **Rate Limiting**: The Perspective API has usage limits +2. **Caching**: Consider implementing caching for repeated content +3. **Batch Processing**: Process articles in batches to manage API quotas +4. **Timeout Settings**: Configured timeouts prevent hanging requests + +## ๐Ÿ”ง Customization + +### Adding More Toxicity Attributes + +The Perspective API supports multiple attributes. To analyze additional ones: + +1. Modify `perspective_service.py`: + ```python + attributes = ["TOXICITY", "SEVERE_TOXICITY", "IDENTITY_ATTACK", "INSULT"] + ``` + +2. Update the database schema to store additional scores + +### Adjusting Thresholds + +Modify the `TOXICITY_THRESHOLD` environment variable or update the service logic for dynamic thresholds. + +## ๐Ÿ› Troubleshooting + +### Common Issues + +1. **"PERSPECTIVE_API_KEY not set"** + - Ensure the API key is set in your `.env` file + - Verify the Perspective API is enabled in Google Cloud Console + +2. **High toxicity scores on normal content** + - Check the content being analyzed + - Consider adjusting the threshold + - Verify the API is working correctly + +3. **Database connection errors** + - Ensure MongoDB is running + - Check the `MONGO_URI` configuration + - Verify database permissions + +4. **News API quota exceeded** + - Check your NewsAPI usage limits + - Consider using a paid plan for higher limits + - Implement request throttling + +## ๐Ÿ”„ Future Enhancements + +Potential improvements to consider: + +1. **Multi-language Support**: Extend toxicity analysis to other languages +2. **Historical Analysis**: Track toxicity trends over time +3. **Content Moderation**: Automatically flag or filter toxic content +4. **Dashboard**: Web interface for toxicity analytics +5. **Machine Learning**: Train custom models for domain-specific toxicity + +## ๐Ÿ“ž Support + +For issues with the toxicity detection functionality: + +1. Run the test suite: `python setup_and_test.py` +2. Check the error logs in the console +3. Verify API keys and environment configuration +4. 
Review the Google Perspective API documentation diff --git a/ml-api/app/__init__.py b/ml-api/app/__init__.py index e6900c03..a6ff9e83 100644 --- a/ml-api/app/__init__.py +++ b/ml-api/app/__init__.py @@ -1,22 +1,64 @@ -from flask import Flask +from flask import Flask, jsonify from flask_cors import CORS from dotenv import load_dotenv +from pymongo import MongoClient import os + load_dotenv() -def create_app(): - """Simple Flask app factory without config classes""" + +def create_app(config_object=None): + """Application factory that configures Flask app and routes.""" app = Flask(__name__) - # Set config directly - app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'dev-secret-key-change-in-production') - app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max file size - cors_origins = os.environ.get('CORS_ORIGINS', '*').split(',') + app.config['SECRET_KEY'] = os.environ.get( + 'SECRET_KEY', + 'dev-secret-key-change-in-production' + ) + app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 - # Initialize CORS + cors_origins = os.environ.get('CORS_ORIGINS', '*').split(',') CORS(app, resources={r"/*": {"origins": cors_origins}}) - # Register blueprints - from app.routes import routes - app.register_blueprint(routes.bp) + if config_object: + app.config.from_object(config_object) + + from app.routes import routes as vader_routes + app.register_blueprint(vader_routes.bp) + + try: + from app.routes import perspective_routes + app.register_blueprint(perspective_routes.bp) + except ImportError: + + pass + + + mongo_client = MongoClient("mongodb://localhost:27017/") + db = mongo_client["news_db"] + collection = db["articles"] + + @app.route("/news", methods=["GET"]) + def get_news(): + articles = list(collection.find().sort("_id", -1).limit(20)) + for article in articles: + article["_id"] = str(article["_id"]) + return jsonify(articles) + @app.route('/') + def root(): + return {'message': 'SentiLog ML API is running โ€” visit /ml-api/'}, 200 + + @app.errorhandler(400) + @app.errorhandler(404) + @app.errorhandler(500) + @app.errorhandler(502) + def custom_error_handler(error): + response = { + 'error': { + 'code': error.code, + 'name': error.name, + 'description': error.description, + } + } + return jsonify(response), error.code - return app \ No newline at end of file + return app diff --git a/ml-api/app/routes/perspective_routes.py b/ml-api/app/routes/perspective_routes.py new file mode 100644 index 00000000..eb45ef20 --- /dev/null +++ b/ml-api/app/routes/perspective_routes.py @@ -0,0 +1,24 @@ +from flask import Blueprint, request, jsonify +from app.services.perspective_service import analyze_toxicity_text + +bp = Blueprint('perspective', __name__, url_prefix='/ml-api') + +@bp.route('/analyze-toxicity', methods=['POST']) +def analyze(): + data = request.get_json(silent=True) + if not data or 'text' not in data: + return jsonify({'error': 'text is required'}), 400 + + try: + result = analyze_toxicity_text(data['text']) + return jsonify({ + 'score': result['score'], + 'toxic': result['toxic'], + 'threshold': result['threshold'] + }), 200 + except ValueError as ve: + return jsonify({'error': str(ve)}), 400 + except RuntimeError as re: + return jsonify({'error': str(re)}), 502 + except Exception as e: + return jsonify({'error': f'Unexpected error: {e}'}), 500 diff --git a/ml-api/app/routes/routes.py b/ml-api/app/routes/routes.py index a4cc2737..80d2608b 100644 --- a/ml-api/app/routes/routes.py +++ b/ml-api/app/routes/routes.py @@ -1,52 +1,190 @@ +# ml-api/app/routes/routes.py from flask 
import Blueprint, request, jsonify from app.services.vader_service import VaderService +from app.services.perspective_service import analyze_toxicity_text +from app.services.news_service import fetch_news +from app.services.db_service import get_news_collection +from pymongo.errors import DuplicateKeyError +import time -# Create blueprint bp = Blueprint('api', __name__, url_prefix='/ml-api') -# Initialize VADER service vader_service = VaderService() + @bp.route('/', methods=['GET']) def index(): return jsonify({'message': 'SentiLog ML API is running'}) +@bp.route('/health', methods=['GET']) +def health_check(): + """Comprehensive health check endpoint""" + import os + from app.services.db_service import get_db + + health_status = { + 'status': 'healthy', + 'timestamp': int(time.time()), + 'services': {}, + 'environment': {} + } + + + required_env_vars = ['NEWS_API_KEY', 'PERSPECTIVE_API_KEY'] + for var in required_env_vars: + health_status['environment'][var] = 'set' if os.getenv(var) else 'missing' + + try: + db = get_db() + server_info = db.client.server_info() + health_status['services']['database'] = 'healthy' + health_status['services']['mongodb_version'] = server_info.get('version') + except Exception as e: + health_status['services']['database'] = f'error: {str(e)}' + health_status['status'] = 'degraded' + + + try: + if os.getenv('NEWS_API_KEY'): + health_status['services']['news_api'] = 'configured' + else: + health_status['services']['news_api'] = 'not configured' + except Exception: + health_status['services']['news_api'] = 'error' + + + try: + if os.getenv('PERSPECTIVE_API_KEY'): + health_status['services']['perspective_api'] = 'configured' + else: + health_status['services']['perspective_api'] = 'not configured' + except Exception: + health_status['services']['perspective_api'] = 'error' + + status_code = 200 if health_status['status'] == 'healthy' else 503 + return jsonify(health_status), status_code + @bp.route('/predict', methods=['POST']) def predict(): - """Main sentiment prediction endpoint""" try: - data = request.get_json() - if not data: - return jsonify({'error': 'No JSON data provided'}), 400 - - text = data.get('text', '') - if not text or not text.strip(): - return jsonify({'error': 'No text provided'}), 400 - - # Use VADER service for sentiment analysis + data = request.get_json(silent=True) + if not data or 'text' not in data: + return jsonify({'error': 'text is required'}), 400 + text = data['text'] sentiment = vader_service.analyze(text) - - return jsonify({ - 'sentiment': sentiment, - 'text_length': len(text) - }) - - except Exception as e: + return jsonify({'sentiment': sentiment, 'text_length': len(text)}), 200 + except Exception: return jsonify({'error': 'Internal server error'}), 500 @bp.route('/vader/analyze', methods=['POST']) def vader_analyze(): - """VADER sentiment analysis endpoint""" try: - data = request.get_json() - if not data: - return jsonify({'error': 'No JSON data provided'}), 400 - - text = data.get('text', '') - if not text or not text.strip(): - return jsonify({'error': 'No text provided'}), 400 - - result = vader_service.analyze(text) - return jsonify({'sentiment': result}) - + data = request.get_json(silent=True) + if not data or 'text' not in data: + return jsonify({'error': 'text is required'}), 400 + text = data['text'] + sentiment = vader_service.analyze(text) + return jsonify({'sentiment': sentiment}), 200 + except Exception: + return jsonify({'error': 'Internal server error'}), 500 + + + +@bp.route('/fetch-and-store', 
methods=['POST']) +def fetch_and_store(): + """ + POST body JSON: + { + "query": "bitcoin", + "page_size": 20, + "max_pages": 2, + "from_date": "2025-08-01" + } + The endpoint will fetch news, run sentiment & toxicity, and store each doc in MongoDB + with a new `toxicity` field. + """ + try: + payload = request.get_json(silent=True) or {} + query = payload.get("query", "technology") + page_size = int(payload.get("page_size", 20)) + max_pages = int(payload.get("max_pages", 1)) + from_date = payload.get("from_date") + + col = get_news_collection() + inserted = 0 + updated = 0 + errors = [] + + for page in range(1, max_pages + 1): + articles = fetch_news(query=query, page_size=page_size, page=page, from_date=from_date) + if not articles: + break + for art in articles: + try: + text_for_analysis = (art.get("content") or "") or (art.get("description") or "") or (art.get("title") or "") + + sentiment = vader_service.analyze(text_for_analysis) + try: + tox = analyze_toxicity_text(text_for_analysis) + toxicity_obj = { + "score": tox["score"], + "toxic": tox["toxic"], + "threshold": tox["threshold"] + } + except Exception as e: + toxicity_obj = {"error": str(e)} + + doc = { + "title": art.get("title"), + "description": art.get("description"), + "content": art.get("content"), + "url": art.get("url"), + "source": art.get("source"), + "publishedAt": art.get("publishedAt"), + "sentiment": sentiment, + "toxicity": toxicity_obj, + "ingested_at": int(time.time()) + } + res = col.update_one({"url": doc["url"]}, {"$set": doc}, upsert=True) + if res.upserted_id: + inserted += 1 + else: + updated += 1 + + except Exception as e: + errors.append({"title": art.get("title"), "error": str(e)}) + + return jsonify({ + "status": "done", + "inserted": inserted, + "updated": updated, + "errors": errors + }), 200 + + except Exception as e: + return jsonify({"error": str(e)}), 500 + +@bp.route('/news', methods=['GET']) +def list_news(): + """ + Query params: + - limit (default 20) + - toxic (optional: true/false) + """ + try: + col = get_news_collection() + limit = int(request.args.get("limit", 20)) + toxic = request.args.get("toxic") + query = {} + if toxic is not None: + if toxic.lower() in ("1", "true", "yes"): + query["toxicity.toxic"] = True + else: + query["toxicity.toxic"] = False + + docs = list(col.find(query).sort("ingested_at", -1).limit(limit)) + + for d in docs: + d["_id"] = str(d["_id"]) + return jsonify({"count": len(docs), "results": docs}), 200 except Exception as e: - return jsonify({'error': 'Internal server error'}), 500 \ No newline at end of file + return jsonify({"error": str(e)}), 500 diff --git a/ml-api/app/services/db_service.py b/ml-api/app/services/db_service.py new file mode 100644 index 00000000..1b968136 --- /dev/null +++ b/ml-api/app/services/db_service.py @@ -0,0 +1,18 @@ +import os +from pymongo import MongoClient + +MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017") +MONGO_DB = os.getenv("MONGO_DB", "sentilog") +NEWS_COLLECTION = os.getenv("NEWS_COLLECTION", "news") + +_client = None + +def get_db(): + global _client + if _client is None: + _client = MongoClient(MONGO_URI) + return _client[MONGO_DB] + +def get_news_collection(): + db = get_db() + return db[NEWS_COLLECTION] diff --git a/ml-api/app/services/news_service.py b/ml-api/app/services/news_service.py new file mode 100644 index 00000000..48e2351e --- /dev/null +++ b/ml-api/app/services/news_service.py @@ -0,0 +1,68 @@ +import os +import requests +import logging +from time import sleep +from .perspective_service 
import analyze_toxicity_text + +NEWS_API_KEY = os.getenv("NEWS_API_KEY") +NEWS_API_URL = os.getenv("NEWS_API_URL", "https://newsapi.org/v2/everything") +DEFAULT_PAGE_SIZE = int(os.getenv("NEWS_PAGE_SIZE", "20")) +REQUEST_TIMEOUT = int(os.getenv("NEWS_TIMEOUT", "8")) +RATE_SLEEP = float(os.getenv("NEWS_RATE_SLEEP", "0.2")) + +def fetch_news(query="technology", page_size=DEFAULT_PAGE_SIZE, page=1, from_date=None): + if not NEWS_API_KEY: + raise RuntimeError("NEWS_API_KEY not set in env") + + params = { + "q": query, + "pageSize": page_size, + "page": page, + "apiKey": NEWS_API_KEY, + "language": "en", + "sortBy": "publishedAt" + } + if from_date: + params["from"] = from_date + + resp = requests.get(NEWS_API_URL, params=params, timeout=REQUEST_TIMEOUT) + resp.raise_for_status() + data = resp.json() + articles = data.get("articles", []) + + normalized = [] + for a in articles: + if not a.get("url") or not a.get("title"): + continue + + text_for_toxicity = " ".join(filter(None, [ + a.get("title"), a.get("description"), a.get("content") + ])) + try: + toxicity_result = analyze_toxicity_text(text_for_toxicity) + toxicity_obj = { + "score": toxicity_result["score"], + "toxic": toxicity_result["toxic"], + "threshold": toxicity_result["threshold"] + } + except Exception as e: + logging.warning(f"Toxicity analysis failed for article: {e}") + toxicity_obj = { + "score": 0.0, + "toxic": False, + "threshold": 0.7, + "error": str(e) + } + + normalized.append({ + "title": a.get("title"), + "description": a.get("description"), + "content": a.get("content") or "", + "url": a.get("url"), + "source": a.get("source", {}).get("name"), + "publishedAt": a.get("publishedAt"), + "toxicity": toxicity_obj + }) + sleep(RATE_SLEEP) + + return normalized diff --git a/ml-api/app/services/perspective_service.py b/ml-api/app/services/perspective_service.py new file mode 100644 index 00000000..dea1b8ea --- /dev/null +++ b/ml-api/app/services/perspective_service.py @@ -0,0 +1,56 @@ +import os +import requests + +PERSPECTIVE_API_KEY = os.getenv("PERSPECTIVE_API_KEY") +PERSPECTIVE_API_URL = os.getenv( + "PERSPECTIVE_API_URL", + "https://commentanalyzer.googleapis.com/v1/comments:analyze" +) +TOXICITY_THRESHOLD = float(os.getenv("TOXICITY_THRESHOLD", "0.7")) +REQUEST_TIMEOUT = int(os.getenv("PERSPECTIVE_TIMEOUT", "6")) + +def analyze_toxicity_text(text, attributes=None): + """Return {'score': float, 'toxic': bool, 'threshold': float, 'raw': {...}}""" + if not text or not isinstance(text, str): + raise ValueError("text must be a non-empty string") + + if attributes is None: + attributes = ["TOXICITY"] + + if not PERSPECTIVE_API_KEY: + raise RuntimeError("PERSPECTIVE_API_KEY not set in env") + + requestedAttributes = {a: {} for a in attributes} + payload = { + "comment": {"text": text}, + "languages": ["en"], + "requestedAttributes": requestedAttributes + } + + try: + res = requests.post( + f"{PERSPECTIVE_API_URL}?key={PERSPECTIVE_API_KEY}", + json=payload, + timeout=REQUEST_TIMEOUT + ) + res.raise_for_status() + except requests.RequestException as e: + raise RuntimeError(f"Perspective API request failed: {e}") + + data = res.json() + tox_value = ( + data.get("attributeScores", {}) + .get("TOXICITY", {}) + .get("summaryScore", {}) + .get("value") + ) + + if tox_value is None: + raise RuntimeError("Perspective API returned no TOXICITY score") + + return { + "score": float(tox_value), + "toxic": float(tox_value) >= TOXICITY_THRESHOLD, + "threshold": TOXICITY_THRESHOLD, + "raw": data + } diff --git 
a/ml-api/app/services/toxicity_service.py b/ml-api/app/services/toxicity_service.py new file mode 100644 index 00000000..64db99a5 --- /dev/null +++ b/ml-api/app/services/toxicity_service.py @@ -0,0 +1,44 @@ +import os +import requests + +PERSPECTIVE_API_KEY = os.getenv("PERSPECTIVE_API_KEY") +PERSPECTIVE_API_URL = os.getenv( + "PERSPECTIVE_API_URL", + "https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze" +) +REQUEST_TIMEOUT = int(os.getenv("PERSPECTIVE_TIMEOUT", "8")) + +def get_toxicity_score(text: str) -> float: + """ + Sends text to Perspective API and returns the toxicity score (0.0 to 1.0). + Returns 0.0 if API key not set or an error occurs. + """ + if not PERSPECTIVE_API_KEY: + raise RuntimeError("PERSPECTIVE_API_KEY not set in env") + + if not text.strip(): + return 0.0 + + payload = { + "comment": {"text": text}, + "languages": ["en"], + "requestedAttributes": {"TOXICITY": {}} + } + + try: + response = requests.post( + f"{PERSPECTIVE_API_URL}?key={PERSPECTIVE_API_KEY}", + json=payload, + timeout=REQUEST_TIMEOUT + ) + response.raise_for_status() + data = response.json() + + return data.get("attributeScores", {}) \ + .get("TOXICITY", {}) \ + .get("summaryScore", {}) \ + .get("value", 0.0) + + except Exception as e: + print(f"[toxicity_service] Error getting toxicity score: {e}") + return 0.0 diff --git a/ml-api/example_usage.py b/ml-api/example_usage.py new file mode 100644 index 00000000..a3408016 --- /dev/null +++ b/ml-api/example_usage.py @@ -0,0 +1,271 @@ +""" +Example Usage Script for SentiLog ML API Toxicity Detection + +This script demonstrates how to use the toxicity detection functionality +in various scenarios. + +Usage: + python example_usage.py +""" + +import os +import sys +import time +import requests +from dotenv import load_dotenv +load_dotenv() +BASE_URL = "http://localhost:5001/ml-api" +SERVER_TIMEOUT = 5 + +def test_server_connection(): + """Test if the server is running and accessible.""" + try: + response = requests.get(f"{BASE_URL}/", timeout=SERVER_TIMEOUT) + return response.status_code == 200 + except: + return False + +def analyze_single_text(): + """Demonstrate single text toxicity analysis.""" + print("๐Ÿงช Testing Single Text Analysis") + print("-" * 40) + + test_texts = [ + "This is a wonderful day with beautiful weather.", + "Technology is advancing rapidly these days.", + "That was a terrible and awful decision!", + "I hate this stupid system, it never works right!", + "The new iPhone features look innovative and exciting." 
+ ] + + for i, text in enumerate(test_texts, 1): + print(f"\nTest {i}: {text[:50]}...") + + try: + response = requests.post( + f"{BASE_URL}/analyze-toxicity", + json={"text": text}, + timeout=SERVER_TIMEOUT + ) + + if response.status_code == 200: + result = response.json() + score = result.get('score', 0) + toxic = result.get('toxic', False) + + status = "TOXIC" if toxic else "SAFE" + print(f" Result: {status} (Score: {score:.3f})") + else: + print(f"Error: {response.status_code}") + + except Exception as e: + print(f" Request failed: {e}") + +def fetch_and_analyze_news(): + """Demonstrate news fetching with toxicity analysis.""" + print("\n\n๐Ÿ“ฐ Testing News Fetching and Analysis") + print("-" * 40) + payload = { + "query": "technology", + "page_size": 5, + "max_pages": 1 + } + + print("Fetching news articles...") + + try: + response = requests.post( + f"{BASE_URL}/fetch-and-store", + json=payload, + timeout=60 + ) + + if response.status_code == 200: + result = response.json() + print(f"Success!") + print(f" Inserted: {result.get('inserted', 0)} articles") + print(f" Updated: {result.get('updated', 0)} articles") + print(f" Errors: {len(result.get('errors', []))}") + + if result.get('errors'): + print(" Error details:") + for error in result['errors'][:3]: + print(f" - {error}") + + else: + print(f"Failed: {response.status_code}") + print(f"Response: {response.text}") + + except Exception as e: + print(f"Request failed: {e}") + +def view_stored_news(): + """View and analyze stored news articles.""" + print("\n\nViewing Stored News") + print("-" * 40) + + try: + # Get all articles + response = requests.get(f"{BASE_URL}/news?limit=10", timeout=SERVER_TIMEOUT) + + if response.status_code == 200: + data = response.json() + articles = data.get('results', []) + + print(f"Found {len(articles)} articles:") + + for i, article in enumerate(articles, 1): + title = article.get('title', 'No title')[:60] + toxicity = article.get('toxicity', {}) + sentiment = article.get('sentiment', 'Unknown') + + if isinstance(toxicity, dict): + score = toxicity.get('score', 0) + toxic = toxicity.get('toxic', False) + status = "๐Ÿ”ด" if toxic else "๐ŸŸข" + else: + score = 0 + status = "โšช" + + print(f"\n{i}. {title}...") + print(f" Sentiment: {sentiment}") + print(f" Toxicity: {status} {score:.3f}") + + else: + print(f"Failed to fetch articles: {response.status_code}") + + except Exception as e: + print(f"Request failed: {e}") + +def filter_toxic_content(): + """Demonstrate filtering content by toxicity level.""" + print("\n\nFiltering Toxic Content") + print("-" * 40) + + try: + # Get only toxic articles + response = requests.get(f"{BASE_URL}/news?limit=5&toxic=true", timeout=SERVER_TIMEOUT) + + if response.status_code == 200: + data = response.json() + articles = data.get('results', []) + + print(f"Found {len(articles)} toxic articles:") + + for i, article in enumerate(articles, 1): + title = article.get('title', 'No title')[:60] + toxicity = article.get('toxicity', {}) + + if isinstance(toxicity, dict): + score = toxicity.get('score', 0) + threshold = toxicity.get('threshold', 0.7) + print(f"\n{i}. {title}...") + print(f" Toxicity Score: {score:.3f} (threshold: {threshold})") + else: + print(f"\n{i}. 
{title}...") + print(f" Toxicity: Error or not analyzed") + + else: + print(f"Failed to fetch toxic articles: {response.status_code}") + + except Exception as e: + print(f"Request failed: {e}") + +def display_statistics(): + """Display toxicity statistics from stored articles.""" + print("\n\nToxicity Statistics:") + print("-" * 40) + + try: + response = requests.get(f"{BASE_URL}/news?limit=100", timeout=SERVER_TIMEOUT) + + if response.status_code == 200: + data = response.json() + articles = data.get('results', []) + + if not articles: + print("No articles found for analysis") + return + + scores = [] + toxic_count = 0 + error_count = 0 + + for article in articles: + toxicity = article.get('toxicity', {}) + if isinstance(toxicity, dict): + if 'score' in toxicity: + scores.append(toxicity['score']) + if toxicity.get('toxic', False): + toxic_count += 1 + else: + error_count += 1 + else: + error_count += 1 + + if scores: + avg_score = sum(scores) / len(scores) + min_score = min(scores) + max_score = max(scores) + + print(f"Total articles analyzed: {len(articles)}") + print(f"Successfully scored: {len(scores)}") + print(f"Errors/Missing scores: {error_count}") + print(f"Marked as toxic: {toxic_count}") + print(f"Average toxicity score: {avg_score:.3f}") + print(f"Score range: {min_score:.3f} - {max_score:.3f}") + + low = sum(1 for s in scores if s < 0.3) + medium = sum(1 for s in scores if 0.3 <= s < 0.7) + high = sum(1 for s in scores if s >= 0.7) + + print(f"\nDistribution:") + print(f" Low (0.0-0.3): {low} articles ({low/len(scores)*100:.1f}%)") + print(f" Medium (0.3-0.7): {medium} articles ({medium/len(scores)*100:.1f}%)") + print(f" High (0.7-1.0): {high} articles ({high/len(scores)*100:.1f}%)") + else: + print("No toxicity scores found") + + else: + print(f"Failed to fetch articles: {response.status_code}") + + except Exception as e: + print(f"Analysis failed: {e}") + +def main(): + """Run the complete demonstration.""" + print("SentiLog ML API - Toxicity Detection Demo") + print("=" * 50) + + # Check if server is running + print("Checking server connection...") + if not test_server_connection(): + print("Server not accessible at", BASE_URL) + print("Please start the server with: python run.py") + sys.exit(1) + + print("โœ… Server is running") + + # Run demonstrations + try: + analyze_single_text() + fetch_and_analyze_news() + time.sleep(2) + view_stored_news() + filter_toxic_content() + display_statistics() + + print("\n" + "=" * 50) + print("Demo completed successfully!") + print("\nNext steps:") + print("- Integrate toxicity detection into your application") + print("- Set up monitoring for toxic content") + print("- Customize thresholds based on your requirements") + + except KeyboardInterrupt: + print("\n\nDemo interrupted by user") + except Exception as e: + print(f"\nDemo failed with error: {e}") + +if __name__ == "__main__": + main() diff --git a/ml-api/news.py b/ml-api/news.py new file mode 100644 index 00000000..e034f946 --- /dev/null +++ b/ml-api/news.py @@ -0,0 +1,15 @@ +from sqlalchemy import Column, String, Text, DateTime, Float, Integer +from .database import Base + +class News(Base): + __tablename__ = "news" + + id = Column(Integer, primary_key=True, index=True) + title = Column(String) + description = Column(Text) + content = Column(Text) + url = Column(String) + source = Column(String) + publishedAt = Column(DateTime) + sentiment = Column(Float) + toxicity = Column(Float) diff --git a/ml-api/run.py b/ml-api/run.py index cfcd7858..2f8fc4e6 100644 --- a/ml-api/run.py 
+++ b/ml-api/run.py @@ -1,24 +1,13 @@ import os from app import create_app -# Create Flask app instance +# Create the Flask app from your existing factory app = create_app() if __name__ == '__main__': - # Get port from environment variable or default to 5001 - port = os.environ.get('PORT') - - # Get host from environment variable or default to localhost + port = int(os.environ.get('PORT', 5000)) host = os.environ.get('HOST', '0.0.0.0') - - # Get debug mode from environment variable debug = os.environ.get('FLASK_ENV') == 'development' - - print(f"Starting sentiment analysis API on {host}:{port}") - print(f"Debug mode: {debug}") - - app.run( - host=host, - port=port, - debug=debug - ) \ No newline at end of file + + print(f"๐Ÿš€ Starting SentiLog ML API on {host}:{port} โ€” debug={debug}") + app.run(host=host, port=port, debug=debug) diff --git a/ml-api/setup_and_test.py b/ml-api/setup_and_test.py new file mode 100644 index 00000000..fd08e976 --- /dev/null +++ b/ml-api/setup_and_test.py @@ -0,0 +1,250 @@ +""" +Setup and Testing Script for SentiLog ML API with Toxicity Detection + +This script helps you: +1. Verify all dependencies are installed +2. Check environment variables +3. Test the toxicity detection functionality +4. Test the complete news pipeline + +Usage: + python setup_and_test.py +""" + +import os +import sys +import json +import requests +from dotenv import load_dotenv +load_dotenv() + +def check_dependencies(): + """Check if all required dependencies are installed.""" + print("๐Ÿ” Checking dependencies...") + + required_packages = [ + 'flask', + 'flask_cors', + 'pymongo', + 'requests', + 'vaderSentiment', + 'dotenv' + ] + + missing = [] + for package in required_packages: + try: + __import__(package.replace('-', '_')) + print(f"{package}") + except ImportError: + print(f"{package}") + missing.append(package) + + if missing: + print(f"\n Missing packages: {', '.join(missing)}") + print("Run: pip install -r requirements.txt") + return False + + print("All dependencies are installed!") + return True + +def check_environment_variables(): + """Check if required environment variables are set.""" + print("\n๐Ÿ” Checking environment variables...") + + required_vars = [ + 'NEWS_API_KEY', + 'PERSPECTIVE_API_KEY' + ] + + optional_vars = { + 'MONGO_URI': 'mongodb://localhost:27017', + 'MONGO_DB': 'sentilog', + 'TOXICITY_THRESHOLD': '0.7' + } + + missing = [] + for var in required_vars: + if os.getenv(var): + print(f" {var}: {'*' * 8}{os.getenv(var)[-4:]}") + else: + print(f" {var}: Not set") + missing.append(var) + + for var, default in optional_vars.items(): + value = os.getenv(var, default) + print(f" {var}: {value}") + + if missing: + print(f"\n Missing required variables: {', '.join(missing)}") + print("Please set them in your .env file") + return False + + print(" Environment variables are configured!") + return True + +def test_toxicity_service(): + """Test the toxicity detection service directly.""" + print("\nTesting toxicity detection service...") + + try: + from app.services.toxicity_service import get_toxicity_score + + # Test with neutral text + neutral_text = "This is a normal news article about technology." + neutral_score = get_toxicity_score(neutral_text) + print(f" Neutral text score: {neutral_score}") + toxic_text = "This is terrible and awful and stupid!" 
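+        # Note (illustrative comment, not in the original patch): these scores come
+        # from the live Perspective API, so exact values depend on the current model;
+        # with the default 0.7 threshold the neutral sentence should stay well below
+        # it, while the sentence above is only expected to score noticeably higher.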
+ toxic_score = get_toxicity_score(toxic_text) + print(f" Potentially toxic text score: {toxic_score}") + + print("Toxicity service is working!") + return True + + except Exception as e: + print(f"Toxicity service error: {e}") + return False + +def test_perspective_service(): + """Test the perspective API service.""" + print("\nTesting Perspective API service...") + + try: + from app.services.perspective_service import analyze_toxicity_text + + test_text = "This is a test message for toxicity analysis." + result = analyze_toxicity_text(test_text) + + print(f" Score: {result['score']:.3f}") + print(f" Toxic: {result['toxic']}") + print(f" Threshold: {result['threshold']}") + + print("Perspective service is working!") + return True + + except Exception as e: + print(f"Perspective service error: {e}") + return False + +def test_database_connection(): + """Test MongoDB connection.""" + print("\nTesting database connection...") + + try: + from app.services.db_service import get_db, get_news_collection + + # Test database connection + db = get_db() + collection = get_news_collection() + + # Try to get collection stats + stats = db.command("collstats", collection.name) + print(f" Database: {db.name}") + print(f" Collection: {collection.name}") + print(f" Document count: {stats.get('count', 0)}") + + print("Database connection is working!") + return True + + except Exception as e: + print(f"Database connection error: {e}") + return False + +def test_news_service(): + """Test the news fetching service.""" + print("\nTesting news service...") + + try: + from app.services.news_service import fetch_news + articles = fetch_news(query="technology", page_size=2, page=1) + + if articles: + print(f" Fetched {len(articles)} articles") + for i, article in enumerate(articles[:2], 1): + print(f" Article {i}:") + print(f" Title: {article.get('title', 'No title')[:50]}...") + print(f" Toxicity: {article.get('toxicity', 'No score')}") + else: + print(" No articles fetched") + + print("News service is working!") + return True + + except Exception as e: + print(f"News service error: {e}") + return False + +def test_api_endpoints(): + """Test API endpoints if the server is running.""" + print("\n๐Ÿงช Testing API endpoints (assuming server is running on localhost:5001)...") + + base_url = "http://localhost:5001/ml-api" + + try: + # Test root endpoint + response = requests.get(f"{base_url}/", timeout=5) + if response.status_code == 200: + print("Root endpoint working") + else: + print(f"Root endpoint returned {response.status_code}") + + # Test toxicity analysis endpoint + test_data = {"text": "This is a test message for API testing."} + response = requests.post(f"{base_url}/analyze-toxicity", json=test_data, timeout=5) + if response.status_code == 200: + result = response.json() + print(f"Toxicity endpoint working (score: {result.get('score', 'N/A')})") + else: + print(f"Toxicity endpoint returned {response.status_code}") + + except requests.exceptions.ConnectionError: + print("Server not running - skipping API tests") + print(" Start the server with: python run.py") + except Exception as e: + print(f"API test error: {e}") + +def main(): + """Run all tests and checks.""" + print("SentiLog ML API - Setup and Testing") + print("=" * 50) + + checks = [ + check_dependencies, + check_environment_variables, + test_database_connection, + test_toxicity_service, + test_perspective_service, + test_news_service, + test_api_endpoints + ] + + results = [] + for check in checks: + try: + result = check() + results.append(result if 
result is not None else False) + except Exception as e: + print(f"Check failed with error: {e}") + results.append(False) + + # Summary + print("\n" + "=" * 50) + print(" SUMMARY") + passed = sum(results) + total = len(results) + print(f"Passed: {passed}/{total} checks") + + if passed == total: + print("All checks passed! Your system is ready to use.") + print("\nNext steps:") + print("1. Start the server: python run.py") + print("2. Test the complete pipeline: POST /ml-api/fetch-and-store") + print("3. View stored news: GET /ml-api/news") + else: + print("Some checks failed. Please address the issues above.") + + return passed == total + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) diff --git a/ml-api/start_server.py b/ml-api/start_server.py new file mode 100644 index 00000000..bdca0a30 --- /dev/null +++ b/ml-api/start_server.py @@ -0,0 +1,189 @@ +""" +Enhanced Server Startup Script for SentiLog ML API + +This script: +1. Performs comprehensive pre-flight checks +2. Validates all configurations +3. Tests API connections +4. Starts the server with proper error handling + +Usage: + python start_server.py +""" + +import os +import sys +import time +import requests +from dotenv import load_dotenv + +def load_environment(): + """Load and validate environment variables.""" + print("Loading environment configuration...") + load_dotenv() + + required_vars = { + 'NEWS_API_KEY': 'News API key from newsapi.org', + 'PERSPECTIVE_API_KEY': 'Google Perspective API key' + } + + missing_vars = [] + for var, description in required_vars.items(): + value = os.getenv(var) + if not value or value == f'your_{var.lower()}': + missing_vars.append((var, description)) + else: + print(f" {var}: Configured") + + if missing_vars: + print("\n Missing required environment variables:") + for var, desc in missing_vars: + print(f" - {var}: {desc}") + print("\nPlease update your .env file with the correct API keys.") + return False + + return True + +def check_dependencies(): + """Check if all required dependencies are installed.""" + print("\n๐Ÿ” Checking dependencies...") + + required_packages = [ + 'flask', 'flask_cors', 'pymongo', 'requests', + 'vaderSentiment', 'dotenv' + ] + + missing = [] + for package in required_packages: + try: + __import__(package.replace('-', '_')) + print(f" {package}") + except ImportError: + missing.append(package) + print(f" {package}") + + if missing: + print(f"\n Missing packages: {', '.join(missing)}") + print("Run: pip install -r requirements.txt") + return False + + return True + +def test_database_connection(): + """Test MongoDB connection.""" + print("\nTesting database connection...") + try: + from app.services.db_service import get_db, get_news_collection + + db = get_db() + server_info = db.client.server_info() + collection = get_news_collection() + doc_count = collection.count_documents({}) + + print(f"Connected to database: {db.name}") + print(f"MongoDB version: {server_info.get('version')}") + print(f" Collection '{collection.name}' has {doc_count} documents") + return True + + except Exception as e: + print(f" Database connection failed: {e}") + print(" Make sure MongoDB is running: mongod") + return False + +def test_api_services(): + """Test external API services.""" + print("\n Testing external API services...") + try: + from app.services.perspective_service import analyze_toxicity_text + result = analyze_toxicity_text("This is a test message") + if isinstance(result, dict) and 'score' in result: + print(f"Perspective API: Working (test 
score: {result['score']:.3f})") + else: + print("Perspective API: Invalid response") + return False + except Exception as e: + print(f" Perspective API failed: {e}") + return False + try: + from app.services.news_service import fetch_news + articles = fetch_news(query="test", page_size=1) + if articles: + print(f"News API: Working (fetched {len(articles)} articles)") + else: + print(" News API: No articles returned (but connection works)") + except Exception as e: + print(f" News API failed: {e}") + return False + + return True + +def pre_flight_check(): + """Run comprehensive pre-flight checks.""" + print(" SentiLog ML API - Pre-flight Checks") + print("=" * 50) + + checks = [ + ("Environment Variables", load_environment), + ("Dependencies", check_dependencies), + ("Database Connection", test_database_connection), + ("API Services", test_api_services) + ] + + all_passed = True + for name, check_func in checks: + if not check_func(): + all_passed = False + + print("\n" + "=" * 50) + if all_passed: + print("All pre-flight checks passed!") + return True + else: + print(" Some checks failed. Please fix the issues above.") + return False + +def start_server(): + """Start the Flask server with proper configuration.""" + print("\n Starting SentiLog ML API server...") + + try: + from app import create_app + app = create_app() + host = os.getenv('HOST', '0.0.0.0') + port = int(os.getenv('PORT', 5001)) + debug = os.getenv('FLASK_ENV') == 'development' + + print(f" Server will start on: http://{host}:{port}") + print(f" Debug mode: {debug}") + print(f" CORS origins: {os.getenv('CORS_ORIGINS', '*')}") + print("\n Available endpoints:") + print(f" โ€ข Health check: http://{host}:{port}/ml-api/health") + print(f" โ€ข Toxicity analysis: http://{host}:{port}/ml-api/analyze-toxicity") + print(f" โ€ข Fetch and store news: http://{host}:{port}/ml-api/fetch-and-store") + print(f" โ€ข List news: http://{host}:{port}/ml-api/news") + print("\n Starting server... (Press Ctrl+C to stop)") + + # Start the server + app.run(host=host, port=port, debug=debug) + + except KeyboardInterrupt: + print("\nServer stopped by user") + except Exception as e: + print(f"\nServer startup failed: {e}") + return False + + return True + +def main(): + """Main entry point.""" + + if not pre_flight_check(): + print("\nCannot start server due to failed checks.") + sys.exit(1) + + print("\n" + "=" * 50) + success = start_server() + sys.exit(0 if success else 1) + +if __name__ == "__main__": + main()
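
Note for reviewers: the README's Customization section suggests requesting additional Perspective attributes (SEVERE_TOXICITY, IDENTITY_ATTACK, INSULT). `analyze_toxicity_text` already builds `requestedAttributes` from its `attributes` parameter, but only the TOXICITY summary score is read back. Below is a minimal sketch of how the extra scores could be pulled out of the same response, assuming the response shape the service already relies on (`attributeScores -> <ATTR> -> summaryScore -> value`); the helper name `extract_attribute_scores` is hypothetical and not part of this patch.

```python
def extract_attribute_scores(data, attributes):
    """Return {attribute: summary score} for each requested Perspective attribute.

    `data` is the parsed JSON response that analyze_toxicity_text already
    receives; attributes missing from the response are simply skipped.
    """
    scores = {}
    for attr in attributes:
        value = (
            data.get("attributeScores", {})
            .get(attr, {})
            .get("summaryScore", {})
            .get("value")
        )
        if value is not None:
            scores[attr] = float(value)
    return scores
```

If adopted, the per-attribute scores could be stored alongside the existing `toxicity` object, in line with the README's "Update the database schema to store additional scores" step.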