def _first_json_object(content):
    """Decode the JSON object that starts at the first ``{`` in *content*.

    Only the first opening brace is tried. Scanning further could return a
    nested inner object from a truncated or malformed reply and pass it off
    as the full report.

    Args:
        content: Text of the model reply.

    Returns:
        The decoded value, or ``None`` when there is no ``{`` or the text
        starting there is not valid JSON.
    """
    start = content.find('{')
    if start == -1:
        return None
    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary or a closing markdown fence is ignored.
        parsed, _ = json.JSONDecoder().raw_decode(content[start:])
    except json.JSONDecodeError:
        return None
    return parsed


def analyze_entities_with_llm(text, target_entities=None):
    """Run an LLM-based entity-salience audit on article text.

    Sends the first 3000 characters of *text* to the chat-completions
    endpoint named by the module-level ``ENDPOINT``, using ``MODEL`` and
    ``GITHUB_TOKEN``, and asks the model for a JSON report: top entities with
    salience scores, a focus grade, and a first-paragraph flag.

    Args:
        text: Article text to audit; only ``text[:3000]`` is sent.
        target_entities: Accepted for interface compatibility. It is not used
            inside this function.
            NOTE(review): presumably the caller compares targets against the
            returned entities -- confirm.

    Returns:
        The decoded JSON value from the model reply on success. On failure,
        a dict with an ``"error"`` message and, when available, a ``"raw"``
        excerpt (at most 500 characters) of the offending reply or HTTP
        error body.
    """
    import urllib.error  # function-scope import: needed to catch HTTPError below

    # NOTE(review): the line breaks inside this prompt were reconstructed from
    # a whitespace-collapsed copy of the file -- confirm against the original.
    prompt = f"""You are a search engine content quality auditor. Analyze this trading/finance article and:

1. Extract the TOP 10 entities (people, concepts, instruments, strategies) mentioned.
2. Rate each entity's SALIENCE (prominence) from 0.0 to 1.0 based on:
   - 1.0 = THE main topic, appears in title and first sentence, discussed throughout
   - 0.5 = Mentioned several times but not central
   - 0.1 = Passing mention only
3. If the first paragraph doesn't directly state the main topic within 15 words, flag it.
4. Score overall content focus: A (laser-focused), B (mostly focused), C (scattered), D (all over the place).

Text to analyze:
---
{text[:3000]}
---

Respond in JSON format:
{{"entities": [{{"name": "...", "salience": 0.X, "type": "..."}}], "focus_grade": "A/B/C/D", "first_paragraph_issue": true/false, "issues": ["..."]}}"""

    payload = json.dumps({
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 1000,
    })

    req = urllib.request.Request(ENDPOINT, data=payload.encode(), headers={
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {GITHUB_TOKEN}',
        'User-Agent': 'GFIL-NLP-Audit/1.0',
    })

    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read().decode())
    except urllib.error.HTTPError as e:
        # Keep the API's error body instead of reporting only the status line.
        failure = {"error": str(e)}
        try:
            body = e.read().decode('utf-8', errors='replace')
        except (OSError, ValueError):
            body = ''
        if body:
            failure["raw"] = body[:500]
        return failure
    except Exception as e:
        # Boundary handler: callers receive an error dict, never an exception,
        # for network, timeout, or response-decoding failures.
        return {"error": str(e)}

    try:
        content = result['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        return {
            "error": "Unexpected API response shape",
            "raw": json.dumps(result, ensure_ascii=False)[:500],
        }

    if isinstance(content, str):
        parsed = _first_json_object(content)
        if parsed is not None:
            return parsed
        return {"error": "Could not parse LLM response", "raw": content[:500]}

    # A null or non-text content field has nothing to parse.
    return {"error": "Could not parse LLM response", "raw": str(content)[:500]}