140 lines
5.5 KiB
Python
140 lines
5.5 KiB
Python
|
|
#!/usr/bin/env python
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
|
|||
|
|
Gemini 地下室方案 #1 替代版: 用 GitHub Models (免费) 替代 Google Cloud NLP 做实体显著性检测
|
|||
|
|
GitHub Models 免费模型: gpt-4o-mini, Phi-4, Llama-3.3-70B 等
|
|||
|
|
用法: python nlp_content_audit.py article.html --targets "Gold" "XAUUSD" "position sizing"
|
|||
|
|
"""
|
|||
|
|
import sys, io, os, re, json, urllib.request
|
|||
|
|
# This script prints Chinese text and emoji, so stdout is re-wrapped to always
# encode as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# GitHub Models free endpoint (via GitHub Marketplace).
# Token lookup order: the GITHUB_TOKEN environment variable first, then the
# project's `config` module. The parent directory of this file is put first on
# sys.path so that `config` is resolved from there.
# NOTE(review): `config` presumably loads the project's .env file - confirm.
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
try:
    from config import GITHUB_TOKEN as _CFG_TOKEN
except ImportError:
    # A missing config module (or one that does not define GITHUB_TOKEN) must
    # not prevent a token supplied through the environment from being used.
    _CFG_TOKEN = ''
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') or _CFG_TOKEN or ''
if not GITHUB_TOKEN:
    print("❌ 未找到 GITHUB_TOKEN。请在 .env 中设置或设置环境变量")
    sys.exit(1)

MODEL = 'gpt-4o-mini'  # Free tier on GitHub Models
ENDPOINT = 'https://models.inference.ai.azure.com/chat/completions'
|
|||
|
|
|
|||
|
|
def analyze_entities_with_llm(text, target_entities=None):
    """Ask the configured chat model for an entity-salience audit of *text*.

    Only the first 3000 characters of *text* are embedded in the prompt. The
    request is POSTed to ``ENDPOINT`` with ``MODEL``, temperature 0.1, a
    1000-token completion limit and a 30-second timeout, authenticated with
    ``GITHUB_TOKEN``.

    Args:
        text: Article text to analyze.
        target_entities: Accepted for interface compatibility; this function
            does not read it.

    Returns:
        On success, the JSON object decoded from the model's reply (the
        prompt asks for the keys ``entities``, ``focus_grade``,
        ``first_paragraph_issue`` and ``issues``). On any failure, a dict
        with an ``"error"`` message and, when response text is available, a
        ``"raw"`` excerpt of at most 500 characters. Transport and parsing
        failures are reported this way rather than raised.
    """
    # Local import so the block does not rely on urllib.request having pulled
    # in urllib.error as a side effect.
    import urllib.error

    prompt = f"""You are a search engine content quality auditor. Analyze this trading/finance article and:

1. Extract the TOP 10 entities (people, concepts, instruments, strategies) mentioned.
2. Rate each entity's SALIENCE (prominence) from 0.0 to 1.0 based on:
   - 1.0 = THE main topic, appears in title and first sentence, discussed throughout
   - 0.5 = Mentioned several times but not central
   - 0.1 = Passing mention only
3. If the first paragraph doesn't directly state the main topic within 15 words, flag it.
4. Score overall content focus: A (laser-focused), B (mostly focused), C (scattered), D (all over the place).

Text to analyze:
---
{text[:3000]}
---

Respond in JSON format:
{{"entities": [{{"name": "...", "salience": 0.X, "type": "..."}}], "focus_grade": "A/B/C/D", "first_paragraph_issue": true/false, "issues": ["..."]}}"""

    payload = json.dumps({
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 1000,
    })

    req = urllib.request.Request(
        ENDPOINT,
        data=payload.encode('utf-8'),
        headers={
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {GITHUB_TOKEN}',
            'User-Agent': 'GFIL-NLP-Audit/1.0',
        },
    )

    # Stage 1: transport. Every failure here becomes an error dict.
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            raw_body = resp.read().decode('utf-8')
    except urllib.error.HTTPError as e:
        # The error body may carry the service's own explanation; reading it
        # is best effort and must not mask the HTTP status itself.
        try:
            detail = e.read().decode('utf-8', errors='replace').strip()
        except Exception:
            detail = ''
        message = f"HTTP {e.code} {e.reason}"
        if detail:
            message = f"{message}: {detail[:500]}"
        return {"error": message}
    except Exception as e:
        return {"error": str(e)}

    # Stage 2: unwrap the chat-completions envelope.
    try:
        content = json.loads(raw_body)['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        return {"error": f"Unexpected API response: {e!r}", "raw": raw_body[:500]}
    if not isinstance(content, str):
        return {"error": "Could not parse LLM response", "raw": str(content)[:500]}

    # Stage 3: extract the JSON object from the reply. The greedy match spans
    # from the first '{' to the last '}', which skips any surrounding prose or
    # Markdown code fences.
    json_match = re.search(r'\{[\s\S]*\}', content)
    if json_match:
        try:
            return json.loads(json_match.group())
        except ValueError as e:
            return {"error": f"Could not parse LLM response: {e}", "raw": content[:500]}
    return {"error": "Could not parse LLM response", "raw": content[:500]}
|
|||
|
|
|
|||
|
|
def audit_html_file(filepath, target_entities=None):
    """Audit one HTML file and print the report to stdout.

    The page's <title>, first <h1> and the first 2500 characters of its
    visible text are sent to ``analyze_entities_with_llm``. The printed
    report contains the focus grade, a first-paragraph warning if flagged,
    up to ten entities with their salience, an optional per-target check
    (a target passes at salience >= 0.8) and any issues the model listed.

    Args:
        filepath: Path to the HTML file; read as UTF-8, undecodable bytes
            are replaced instead of aborting the audit.
        target_entities: Optional iterable of entity names to look for.
            Matching is a case-insensitive substring test against the
            entity names returned by the model; when several entities
            match, the most salient one is reported.

    Returns:
        None. All results, including analysis failures, are printed.
    """
    with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
        markup = f.read()

    title_text = _first_tag_text(markup, 'title')
    h1_text = _first_tag_text(markup, 'h1')
    body = _visible_text(markup)

    print(f"文件: {os.path.basename(filepath)}")
    print(f"标题: {title_text[:100]}")
    print(f"H1: {h1_text[:100]}")
    print("分析中...")

    text = f"{title_text}\n{h1_text}\n{body[:2500]}"
    result = analyze_entities_with_llm(text, target_entities)

    if 'error' in result:
        print(f"\n❌ 分析失败: {result['error']}")
        return

    # The entity list comes from a language model, so tolerate a missing
    # list and skip entries that are not objects.
    entities = [e for e in (result.get('entities') or []) if isinstance(e, dict)]

    print("\n=== 内容质量审计 ===")
    print(f"集中度评分: {result.get('focus_grade', 'N/A')}")
    if result.get('first_paragraph_issue'):
        print("⚠️ 第一段问题: 主题不够突出(前15字未点题)")

    print("\n=== 实体显著性 Top 10 ===")
    print(f"{'实体':<30} {'显著性':>8} {'类型':<15}")
    print("-" * 60)
    for e in entities[:10]:
        kind = str(e.get('type') or '')
        print(f"{_entity_name(e):<30} {_entity_salience(e):.3f} {kind:<15}")

    if target_entities:
        print("\n=== 目标实体检测 ===")
        for t in target_entities:
            needle = t.lower()
            scores = [
                _entity_salience(e)
                for e in entities
                if needle in _entity_name(e).lower()
            ]
            if scores:
                # Judge the target by its most prominent matching entity;
                # the model is not guaranteed to list entities in order.
                best = max(scores)
                ok = '✅ 达标' if best >= 0.8 else '❌ 不够(需≥0.8)'
                print(f" {t}: 显著度 {best:.3f} {ok}")
            else:
                print(f" {t}: ❌ 未找到! 需在标题或第一段加入")

    if result.get('issues'):
        print("\n=== 发现的问题 ===")
        for issue in result['issues']:
            print(f" - {issue}")


def _strip_tags(fragment):
    """Replace tags with spaces, decode HTML entities and collapse whitespace."""
    from html import unescape

    # Tags are removed before entities are decoded so that an escaped
    # "&lt;b&gt;" in the text is not mistaken for markup.
    without_tags = re.sub(r'<[^>]+>', ' ', fragment)
    return re.sub(r'\s+', ' ', unescape(without_tags)).strip()


def _first_tag_text(markup, tag):
    """Return the text of the first <tag> element in *markup*, or '' if none.

    Attributes on the opening tag, upper-case tag names and content that
    spans several lines are all accepted.
    """
    found = re.search(
        rf'<{tag}\b[^>]*>(.*?)</{tag}\s*>',
        markup,
        re.IGNORECASE | re.DOTALL,
    )
    return _strip_tags(found.group(1)) if found else ''


def _visible_text(markup):
    """Return the page text with comments, <script> and <style> content removed."""
    cleaned = re.sub(r'<!--.*?-->', ' ', markup, flags=re.DOTALL)
    cleaned = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>',
        ' ',
        cleaned,
        flags=re.IGNORECASE | re.DOTALL,
    )
    return _strip_tags(cleaned)


def _entity_name(entity):
    """Return the entity's name as a string ('' when missing)."""
    return str(entity.get('name') or '')


def _entity_salience(entity):
    """Return the entity's salience as a float (0.0 when missing or non-numeric)."""
    try:
        return float(entity.get('salience', 0.0))
    except (TypeError, ValueError):
        return 0.0
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # Command-line entry point: audit one HTML file, optionally checking a
    # list of target entities.
    import argparse

    p = argparse.ArgumentParser(description='GitHub Models 免费实体审计 (替代Google NLP)')
    p.add_argument('file', help='HTML 文件路径')
    p.add_argument('--targets', nargs='+', help='目标实体: --targets Gold XAUUSD "position sizing"')
    args = p.parse_args()

    # Report a bad path through argparse (usage message, exit status 2)
    # instead of letting open() raise a traceback later.
    if not os.path.isfile(args.file):
        p.error(f"文件不存在: {args.file}")

    # NOTE(review): '1234567890' presumably marks a sample/placeholder token
    # that was never replaced - confirm against the project's config template.
    if '1234567890' in GITHUB_TOKEN:
        print("❌ 请设置 GITHUB_TOKEN 环境变量 (你的 GitHub Personal Access Token)")
        print(" $env:GITHUB_TOKEN='github_pat_xxxx' # PowerShell")
        sys.exit(1)

    audit_html_file(args.file, args.targets)
|