Files
gfil-blog/deploy_scripts/nlp_content_audit.py
2026-06-28 17:19:47 +00:00

140 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Entity-salience content audit backed by GitHub Models.

Alternative version of "Gemini basement plan #1": uses the free GitHub Models
service in place of Google Cloud NLP for entity-salience detection.
Free models on GitHub Models include gpt-4o-mini, Phi-4, Llama-3.3-70B, etc.

Usage:
    python nlp_content_audit.py article.html --targets "Gold" "XAUUSD" "position sizing"
"""
import io
import json
import os
import re
import sys
import urllib.request

# Emit UTF-8 on stdout so the emoji / CJK report text can be printed.
# reconfigure() changes the encoding of the existing stream in place and keeps
# its buffering mode; wrapping sys.stdout.buffer in a fresh TextIOWrapper (the
# fallback) yields a stream without line buffering, which delays progress
# messages until the stream is flushed.
if hasattr(sys.stdout, 'reconfigure'):
    sys.stdout.reconfigure(encoding='utf-8')
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

# GitHub Models free endpoint (via GitHub Marketplace).
# Token lookup order: environment variable first, then the project's config
# module (looked up in the parent directory of this script).
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
try:
    from config import GITHUB_TOKEN as _CFG_TOKEN
except ImportError:
    # A missing config module (or one without GITHUB_TOKEN) must not prevent
    # the environment variable from being used.
    _CFG_TOKEN = None

GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') or _CFG_TOKEN or ''
if not GITHUB_TOKEN:
    print("❌ 未找到 GITHUB_TOKEN。请在 .env 中设置或设置环境变量")
    sys.exit(1)

MODEL = 'gpt-4o-mini'  # Free tier on GitHub Models
ENDPOINT = 'https://models.inference.ai.azure.com/chat/completions'
def _first_json_object(content):
    """Decode the JSON object that starts at the first '{' in *content*.

    Text after the end of that object is ignored. Returns None when there is
    no '{' or when the text starting there is not valid JSON.
    """
    start = content.find('{')
    if start == -1:
        return None
    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # trailing commentary from the model (even if it contains braces)
        # does not break parsing.
        parsed, _end = json.JSONDecoder().raw_decode(content, start)
    except ValueError:
        return None
    return parsed


def analyze_entities_with_llm(text, target_entities=None):
    """Run an entity-salience audit of *text* through the chat model.

    Stand-in for Google Cloud NLP entity analysis: the first 3000 characters
    of *text* are embedded in an auditing prompt and POSTed to ``ENDPOINT``
    for ``MODEL``, authenticated with ``GITHUB_TOKEN``.

    Args:
        text: Plain text of the article to analyse.
        target_entities: Accepted for interface compatibility; it is not used
            when building the request.

    Returns:
        The JSON object decoded from the model's reply (the prompt asks for
        the keys ``entities``, ``focus_grade``, ``first_paragraph_issue`` and
        ``issues``). On failure a dict with an ``error`` key is returned
        instead; when the reply was received but held no decodable JSON
        object, ``raw`` carries the first 500 characters of the reply.
    """
    prompt = f"""You are a search engine content quality auditor. Analyze this trading/finance article and:
1. Extract the TOP 10 entities (people, concepts, instruments, strategies) mentioned.
2. Rate each entity's SALIENCE (prominence) from 0.0 to 1.0 based on:
- 1.0 = THE main topic, appears in title and first sentence, discussed throughout
- 0.5 = Mentioned several times but not central
- 0.1 = Passing mention only
3. If the first paragraph doesn't directly state the main topic within 15 words, flag it.
4. Score overall content focus: A (laser-focused), B (mostly focused), C (scattered), D (all over the place).
Text to analyze:
---
{text[:3000]}
---
Respond in JSON format:
{{"entities": [{{"name": "...", "salience": 0.X, "type": "..."}}], "focus_grade": "A/B/C/D", "first_paragraph_issue": true/false, "issues": ["..."]}}"""
    payload = json.dumps({
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 1000,
    })
    req = urllib.request.Request(
        ENDPOINT,
        data=payload.encode(),
        headers={
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {GITHUB_TOKEN}',
            'User-Agent': 'GFIL-NLP-Audit/1.0',
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            result = json.loads(resp.read().decode())
        content = result['choices'][0]['message']['content']
        parsed = _first_json_object(content)
    except Exception as e:
        # Deliberate best-effort boundary: network, HTTP and response-shape
        # failures are all reported to the caller as an error dict.
        return {"error": str(e)}
    if parsed is None:
        return {"error": "Could not parse LLM response", "raw": content[:500]}
    return parsed
def _html_to_text(fragment):
    """Reduce an HTML fragment to whitespace-normalised text.

    Drops <script>/<style> elements together with their contents, removes the
    remaining tags, decodes character references and collapses whitespace.
    """
    from html import unescape  # local import; file-level imports stay untouched

    without_code = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>', ' ', fragment,
        flags=re.IGNORECASE | re.DOTALL,
    )
    without_tags = re.sub(r'<[^>]+>', ' ', without_code)
    return re.sub(r'\s+', ' ', unescape(without_tags)).strip()


def _normalise_entities(raw_entities):
    """Return the well-formed entity dicts with str name/type and float salience.

    The list comes from model output, so entries that are not dicts or have no
    ``name`` are skipped, and a missing or non-numeric ``salience`` becomes 0.0.
    """
    if not isinstance(raw_entities, list):
        return []
    cleaned = []
    for item in raw_entities:
        if not isinstance(item, dict) or 'name' not in item:
            continue
        try:
            salience = float(item.get('salience', 0.0))
        except (TypeError, ValueError):
            salience = 0.0
        cleaned.append({
            **item,
            'name': str(item['name']),
            'salience': salience,
            'type': str(item.get('type') or ''),
        })
    return cleaned


def audit_html_file(filepath, target_entities=None):
    """Audit one HTML file and print an entity-salience report to stdout.

    The file is read as UTF-8. Its <title> text, first <h1> text and the first
    2500 characters of its tag-stripped text are passed to
    ``analyze_entities_with_llm``; the result is printed as a report (focus
    grade, first-paragraph warning, up to 10 entities, optional target-entity
    check, and any issues listed by the model).

    Args:
        filepath: Path of the HTML file to analyse.
        target_entities: Optional iterable of entity names. Each is matched
            case-insensitively as a substring of the returned entity names and
            is reported as met when the matched salience is >= 0.8.

    Returns:
        None. If the analysis result contains an ``error`` key, the error is
        printed and the function returns early.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        markup = f.read()

    # Tolerate attributes on the tags, any letter case and multi-line content.
    tag_flags = re.IGNORECASE | re.DOTALL
    title = re.search(r'<title\b[^>]*>(.*?)</title>', markup, tag_flags)
    h1 = re.search(r'<h1\b[^>]*>(.*?)</h1>', markup, tag_flags)
    title_text = _html_to_text(title.group(1)) if title else ''
    h1_text = _html_to_text(h1.group(1)) if h1 else ''
    body = _html_to_text(markup)

    print(f"文件: {os.path.basename(filepath)}")
    print(f"标题: {title_text[:100]}")
    print(f"H1: {h1_text[:100]}")
    print("分析中...")

    text = f"{title_text}\n{h1_text}\n{body[:2500]}"
    result = analyze_entities_with_llm(text, target_entities)
    if 'error' in result:
        print(f"\n❌ 分析失败: {result['error']}")
        return

    print("\n=== 内容质量审计 ===")
    print(f"集中度评分: {result.get('focus_grade', 'N/A')}")
    if result.get('first_paragraph_issue'):
        print("⚠️ 第一段问题: 主题不够突出前15字未点题")

    entities = _normalise_entities(result.get('entities'))
    print("\n=== 实体显著性 Top 10 ===")
    print(f"{'实体':<30} {'显著性':>8} {'类型':<15}")
    print("-" * 60)
    for e in entities[:10]:
        print(f"{e['name']:<30} {e['salience']:.3f} {e['type']:<15}")

    if target_entities:
        print("\n=== 目标实体检测 ===")
        by_name = {e['name'].lower(): e for e in entities}
        for t in target_entities:
            wanted = t.lower()
            # First returned entity whose name contains the target wins.
            found = next((e for name, e in by_name.items() if wanted in name), None)
            if found:
                ok = '✅ 达标' if found['salience'] >= 0.8 else '❌ 不够(需≥0.8)'
                print(f" {t}: 显著度 {found['salience']:.3f} {ok}")
            else:
                print(f" {t}: ❌ 未找到! 需在标题或第一段加入")

    issues = result.get('issues')
    if issues:
        if isinstance(issues, str):
            # A bare string would otherwise be printed one character per line.
            issues = [issues]
        print("\n=== 发现的问题 ===")
        for issue in issues:
            print(f" - {issue}")
if __name__ == '__main__':
    import argparse

    # Command-line interface: one HTML file plus an optional list of target entities.
    parser = argparse.ArgumentParser(description='GitHub Models 免费实体审计 (替代Google NLP)')
    parser.add_argument('file', help='HTML 文件路径')
    parser.add_argument(
        '--targets',
        nargs='+',
        help='目标实体: --targets Gold XAUUSD "position sizing"',
    )
    cli_args = parser.parse_args()

    # A token containing this digit run is rejected; presumably it marks a
    # placeholder value that was never replaced with a real token.
    if '1234567890' not in GITHUB_TOKEN:
        audit_html_file(cli_args.file, cli_args.targets)
    else:
        print("❌ 请设置 GITHUB_TOKEN 环境变量 (你的 GitHub Personal Access Token)")
        print(" $env:GITHUB_TOKEN='github_pat_xxxx' # PowerShell")
        sys.exit(1)