Files
gfil-blog/deploy_scripts/gsc_index_monitor.py

157 lines
5.8 KiB
Python
Raw Normal View History

2026-06-28 17:19:47 +00:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
GSC 索引收割监控 Google Search Console API
每天检查: 已编入索引 vs 已发现未索引比例 + 各语言AIO展现量
用法: python gsc_index_monitor.py
"""
import sys, io, os, json, datetime
# Re-wrap stdout's underlying byte stream so that everything this script
# prints is encoded as UTF-8, whatever the console/locale default is.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# RackNerd is US-based — no proxy needed
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
# Public origin of the blog; used to build the sitemap URL.
SITE = 'https://blog.quant-view.xyz'
# Search Console property identifier (domain property) used in API paths.
SITE_URL = 'sc-domain:blog.quant-view.xyz'

# The service-account key lives one directory above this script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_KEY_NAME = 'gothic-venture-498218-u0-15afe4efe6f3.json'
KEY_FILE = os.path.join(_SCRIPT_DIR, '..', _KEY_NAME)
def get_session(scopes):
    """Build an authorized HTTP session from the service-account key.

    Args:
        scopes: OAuth scopes handed to the service-account credentials.

    Returns:
        An ``AuthorizedSession`` wrapping credentials loaded from ``KEY_FILE``.
    """
    credentials = service_account.Credentials.from_service_account_file(
        KEY_FILE,
        scopes=scopes,
    )
    # No proxy is configured: the RackNerd US server has direct internet access.
    return AuthorizedSession(credentials)
def _fetch_search_analytics_rows(session, start_date, end_date,
                                 page_size=25000, max_requests=10):
    """Return all Search Analytics rows (page x country) for the date range.

    Pages through the result set with ``startRow`` so that large sites are not
    silently truncated. Prints the API error and returns None when any request
    does not answer with HTTP 200.
    """
    from urllib.parse import quote

    # The property id contains ':' (and '/' for URL-prefix properties), so it
    # must be percent-encoded before being placed in the URL path.
    endpoint = (
        'https://www.googleapis.com/webmasters/v3/sites/'
        f'{quote(SITE_URL, safe="")}/searchAnalytics/query'
    )
    rows = []
    for request_index in range(max_requests):
        body = {
            'startDate': start_date.isoformat(),
            'endDate': end_date.isoformat(),
            'dimensions': ['page', 'country'],
            'rowLimit': page_size,
            'startRow': request_index * page_size,
        }
        resp = session.post(
            endpoint,
            data=json.dumps(body),
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            print(f'Search Analytics API error: {resp.status_code}')
            print(resp.text[:500])
            return None
        batch = resp.json().get('rows', [])
        rows.extend(batch)
        # A short (or empty) batch means the result set is exhausted.
        if len(batch) < page_size:
            break
    return rows


def _count_sitemap_urls(sitemap_url, fallback=259):
    """Count the https ``<loc>`` entries in the sitemap at *sitemap_url*.

    Best-effort: returns *fallback* when the sitemap cannot be fetched or
    decoded, after printing the reason.
    """
    import re
    import urllib.request

    try:
        req = urllib.request.Request(sitemap_url, headers={'User-Agent': 'GFIL-GSC/1.0'})
        with urllib.request.urlopen(req, timeout=30) as r:
            xml = r.read().decode()
    except Exception as exc:
        # Deliberately broad (network, HTTP, decoding errors all fall back),
        # but no longer swallows KeyboardInterrupt/SystemExit like a bare except.
        print(f'Sitemap fetch failed ({exc!r}); using fallback count {fallback}')
        return fallback
    return len(re.findall(r'<loc>(https://[^<]+)</loc>', xml))


def _count_pages_per_language(pages):
    """Bucket page URLs by language path segment.

    A page counts for 'zh'/'es'/'ar' when its URL contains ``/<lang>/``; it
    counts for 'en' when it contains none of those segments.
    """
    other_langs = ('zh', 'es', 'ar')
    counts = {
        'en': sum(
            1 for p in pages
            if not any(f'/{lang}/' in p for lang in other_langs)
        )
    }
    for lang in other_langs:
        counts[lang] = sum(1 for p in pages if f'/{lang}/' in p)
    return counts


def check_indexing_status():
    """Estimate index coverage from Search Analytics data and the sitemap.

    Queries the Search Console Search Analytics API for the last 7 days,
    grouped by page and country, and treats every page that appears in the
    result as indexed. The total page count is the number of https ``<loc>``
    entries in ``{SITE}/sitemap.xml`` (259 if the sitemap is unreachable).

    Returns:
        A dict with the keys ``date``, ``total_sitemap_urls``,
        ``indexed_pages``, ``index_ratio``, ``discovered_not_indexed``,
        ``countries`` (top 10 by impressions), ``per_language``,
        ``total_impressions`` and ``total_clicks``; or None when the
        Search Analytics request fails (the error is printed).
    """
    session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])
    today = datetime.date.today()
    week_ago = today - datetime.timedelta(days=7)

    rows = _fetch_search_analytics_rows(session, week_ago, today)
    if rows is None:
        return None

    # Every page present in the result had at least one row of search data.
    indexed_pages = set()
    countries = {}
    for row in rows:
        page, country = row['keys'][0], row['keys'][1]
        indexed_pages.add(page)
        countries[country] = countries.get(country, 0) + row.get('impressions', 0)

    total_urls = _count_sitemap_urls(f'{SITE}/sitemap.xml')
    indexed_count = len(indexed_pages)
    # Guard against an empty sitemap (no <loc> matches) to avoid dividing by zero.
    percent = indexed_count * 100 // total_urls if total_urls else 0

    top_countries = sorted(countries.items(), key=lambda item: item[1], reverse=True)[:10]
    return {
        'date': today.isoformat(),
        'total_sitemap_urls': total_urls,
        'indexed_pages': indexed_count,
        'index_ratio': f'{indexed_count}/{total_urls} = {percent}%',
        # Pages seen in search but absent from the sitemap must not push this below zero.
        'discovered_not_indexed': max(total_urls - indexed_count, 0),
        'countries': dict(top_countries),
        'per_language': _count_pages_per_language(indexed_pages),
        'total_impressions': sum(row.get('impressions', 0) for row in rows),
        'total_clicks': sum(row.get('clicks', 0) for row in rows),
    }
def check_sitemap_status():
    """Check sitemap submission status in GSC.

    Lists the sitemaps registered for ``SITE_URL`` through the Search Console
    sitemaps endpoint.

    Returns:
        A list with one dict per sitemap (keys ``path``, ``submitted``,
        ``downloaded``, ``urls``, ``warnings``, ``errors``), or None when the
        request does not answer with HTTP 200 (the error is printed).
    """
    from urllib.parse import quote

    session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])
    # Percent-encode the property id (it contains ':') before using it as a path segment.
    resp = session.get(
        f'https://www.googleapis.com/webmasters/v3/sites/{quote(SITE_URL, safe="")}/sitemaps',
        timeout=30,
    )
    if resp.status_code != 200:
        # Report the failure the same way check_indexing_status does instead
        # of returning None silently.
        print(f'Sitemaps API error: {resp.status_code}')
        print(resp.text[:500])
        return None

    results = []
    for entry in resp.json().get('sitemap', []):
        # Only the first "contents" entry is reported; a missing or empty
        # list counts as 0 submitted URLs.
        contents = entry.get('contents') or [{}]
        results.append({
            'path': entry.get('path', ''),
            'submitted': entry.get('lastSubmitted', 'N/A'),
            'downloaded': entry.get('lastDownloaded', 'N/A'),
            'urls': contents[0].get('submitted', 0),
            'warnings': entry.get('warnings', 0),
            'errors': entry.get('errors', 0),
        })
    return results
if __name__ == '__main__':
    # Banner: which site is being checked, and when.
    print('=== GSC Index Harvest Monitor ===')
    print(f'Site: {SITE}')
    print(f'Time: {datetime.datetime.now().isoformat()}\n')

    # Without the service-account key no API call is possible; exit cleanly.
    if not os.path.exists(KEY_FILE):
        print('Service account key not found. Skipping GSC API.')
        sys.exit(0)

    # 1. Indexing status
    print('--- Index Status ---')
    index_report = check_indexing_status()
    if index_report:
        # (label, key in the report dict), printed in this order.
        summary_fields = (
            ('Sitemap URLs', 'total_sitemap_urls'),
            ('Indexed (7d imp)', 'indexed_pages'),
            ('Index Ratio', 'index_ratio'),
            ('Discovered/NotIdx', 'discovered_not_indexed'),
            ('Total Impressions', 'total_impressions'),
            ('Total Clicks', 'total_clicks'),
        )
        for label, key in summary_fields:
            print(f' {label}: {index_report[key]}')

        print('\n Per Language:')
        for language, page_count in index_report['per_language'].items():
            print(f' {language}: {page_count} indexed pages')

        print('\n Top Countries:')
        for country_code, impressions in index_report['countries'].items():
            print(f' {country_code}: {impressions} impressions')

    # 2. Sitemap status (a None or empty result prints nothing)
    print('\n--- Sitemap Status ---')
    for sitemap in check_sitemap_status() or ():
        print(f' {sitemap["path"]}: {sitemap["urls"]} URLs, {sitemap.get("errors",0)} errors, {sitemap.get("warnings",0)} warnings')

    print('\n=== Done ===')