157 lines
5.8 KiB
Python
157 lines
5.8 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
GSC index harvest monitor — Google Search Console API
Daily check: indexed vs. discovered-but-not-indexed ratio + per-language AIO impressions
Usage: python gsc_index_monitor.py
|
|
"""
|
|
import sys, io, os, json, datetime
|
|
# Re-wrap stdout so that everything printed by this script is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
|
|
# RackNerd is US-based — no proxy needed
|
|
|
|
from google.oauth2 import service_account
|
|
from google.auth.transport.requests import AuthorizedSession
|
|
|
|
# Public base URL of the site; used to build the sitemap address and for display.
SITE = 'https://blog.quant-view.xyz'
# Property identifier inserted into the Search Console API request paths.
SITE_URL = 'sc-domain:blog.quant-view.xyz'
# The service-account key is looked up one directory above this script.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
KEY_FILE = os.path.join(
    _SCRIPT_DIR,
    '..',
    'gothic-venture-498218-u0-15afe4efe6f3.json',
)
|
|
|
|
def get_session(scopes):
    """Create an authorized HTTP session from the service-account key file.

    Args:
        scopes: OAuth scopes to request when loading the credentials.

    Returns:
        An ``AuthorizedSession`` built from the credentials in ``KEY_FILE``.
    """
    # No proxy is configured: the RackNerd US server has direct internet access.
    credentials = service_account.Credentials.from_service_account_file(
        KEY_FILE, scopes=scopes
    )
    return AuthorizedSession(credentials)
|
|
|
|
def _count_sitemap_urls(sitemap_url, fallback):
    """Count the https ``<loc>`` entries in the sitemap at *sitemap_url*.

    Args:
        sitemap_url: Address of the sitemap XML document to download.
        fallback: Value returned when the sitemap cannot be fetched or decoded.

    Returns:
        The number of matching ``<loc>`` entries, or *fallback* on failure.
    """
    import http.client
    import re
    import urllib.request

    try:
        req = urllib.request.Request(sitemap_url, headers={'User-Agent': 'GFIL-GSC/1.0'})
        with urllib.request.urlopen(req, timeout=30) as r:
            xml = r.read().decode()
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # Best effort: keep the report going, but say why the fallback was used
        # instead of silently swallowing every exception.
        print(f'Sitemap fetch failed ({exc}); assuming {fallback} URLs')
        return fallback
    return len(re.findall(r'<loc>(https://[^<]+)</loc>', xml))


def check_indexing_status():
    """Fetch index status: indexed vs. discovered-but-not-indexed pages.

    Queries Search Analytics for the last 7 days grouped by page and country.
    Every page that appears in the returned rows is counted as indexed, and
    that count is compared with the number of URLs listed in the sitemap.

    Returns:
        A dict with the keys ``date``, ``total_sitemap_urls``,
        ``indexed_pages``, ``index_ratio``, ``discovered_not_indexed``,
        ``countries`` (top 10 by impressions), ``per_language``,
        ``total_impressions`` and ``total_clicks``; or ``None`` when the
        Search Analytics request does not return HTTP 200.
    """
    from urllib.parse import quote

    session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])

    today = datetime.date.today()
    week_ago = today - datetime.timedelta(days=7)

    # Rows are (page, country) pairs, so a small row limit truncates the set of
    # pages; request the documented maximum to avoid undercounting.
    body = {
        'startDate': week_ago.isoformat(),
        'endDate': today.isoformat(),
        'dimensions': ['page', 'country'],
        'rowLimit': 25000,
    }

    # The property identifier is a path segment and must be URL-encoded.
    site_id = quote(SITE_URL, safe='')
    resp = session.post(
        f'https://www.googleapis.com/webmasters/v3/sites/{site_id}/searchAnalytics/query',
        data=json.dumps(body),
        headers={'Content-Type': 'application/json'},
        timeout=30,
    )

    if resp.status_code != 200:
        print(f'Search Analytics API error: {resp.status_code}')
        print(resp.text[:500])
        return None

    rows = resp.json().get('rows', [])

    # Single pass: unique pages, impressions per country, and overall totals.
    indexed_pages = set()
    countries = {}
    total_impressions = 0
    total_clicks = 0
    for row in rows:
        page = row['keys'][0]
        country = row['keys'][1]
        impressions = row.get('impressions', 0)
        indexed_pages.add(page)
        countries[country] = countries.get(country, 0) + impressions
        total_impressions += impressions
        total_clicks += row.get('clicks', 0)

    # Total page count comes from the sitemap (259 is the fallback count).
    total_urls = _count_sitemap_urls(f'{SITE}/sitemap.xml', 259)

    # Per-language counts: a page belongs to a language when its URL contains
    # '/<lang>/'; everything without a zh/es/ar segment is counted as English.
    other_langs = ('zh', 'es', 'ar')
    lang_stats = {
        'en': sum(
            1 for p in indexed_pages
            if not any(f'/{code}/' in p for code in other_langs)
        ),
    }
    for lang in other_langs:
        lang_stats[lang] = sum(1 for p in indexed_pages if f'/{lang}/' in p)

    indexed_count = len(indexed_pages)
    # Guard against an empty sitemap, which would otherwise divide by zero.
    percent = indexed_count * 100 // total_urls if total_urls else 0
    top_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:10]

    return {
        'date': today.isoformat(),
        'total_sitemap_urls': total_urls,
        'indexed_pages': indexed_count,
        'index_ratio': f'{indexed_count}/{total_urls} = {percent}%',
        # Pages with impressions may not all be in the sitemap; never go negative.
        'discovered_not_indexed': max(0, total_urls - indexed_count),
        'countries': dict(top_countries),
        'per_language': lang_stats,
        'total_impressions': total_impressions,
        'total_clicks': total_clicks,
    }
|
|
|
|
def check_sitemap_status():
    """Check sitemap submission status in GSC.

    Returns:
        A list with one dict per sitemap reported by the API, holding the keys
        ``path``, ``submitted``, ``downloaded``, ``urls`` (the ``submitted``
        value of the first ``contents`` entry, or 0 when there is none),
        ``warnings`` and ``errors``; or ``None`` when the request does not
        return HTTP 200.
    """
    from urllib.parse import quote

    session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])
    # The property identifier is a path segment and must be URL-encoded.
    site_id = quote(SITE_URL, safe='')
    resp = session.get(
        f'https://www.googleapis.com/webmasters/v3/sites/{site_id}/sitemaps',
        timeout=30,
    )
    if resp.status_code != 200:
        # Report the failure the same way check_indexing_status does instead
        # of returning None without any explanation.
        print(f'Sitemaps API error: {resp.status_code}')
        print(resp.text[:500])
        return None

    results = []
    for s in resp.json().get('sitemap', []):
        # A missing or empty 'contents' list yields 0 submitted URLs.
        contents = s.get('contents') or [{}]
        results.append({
            'path': s.get('path', ''),
            'submitted': s.get('lastSubmitted', 'N/A'),
            'downloaded': s.get('lastDownloaded', 'N/A'),
            'urls': contents[0].get('submitted', 0),
            'warnings': s.get('warnings', 0),
            'errors': s.get('errors', 0),
        })
    return results
|
|
|
|
if __name__ == '__main__':
    # Script entry point: print a banner, then the index and sitemap reports.
    print('=== GSC Index Harvest Monitor ===')
    print(f'Site: {SITE}')
    print(f'Time: {datetime.datetime.now().isoformat()}\n')

    # Without the service-account key nothing can be queried; exit cleanly.
    if not os.path.exists(KEY_FILE):
        print('Service account key not found. Skipping GSC API.')
        sys.exit(0)

    # 1. Indexing status
    print('--- Index Status ---')
    stats = check_indexing_status()
    if stats:
        # (label, stats key) pairs, printed in this order.
        summary_fields = (
            (' Sitemap URLs: ', 'total_sitemap_urls'),
            (' Indexed (7d imp): ', 'indexed_pages'),
            (' Index Ratio: ', 'index_ratio'),
            (' Discovered/NotIdx: ', 'discovered_not_indexed'),
            (' Total Impressions: ', 'total_impressions'),
            (' Total Clicks: ', 'total_clicks'),
        )
        for label, key in summary_fields:
            print(f'{label}{stats[key]}')

        print('\n Per Language:')
        for lang, count in stats['per_language'].items():
            print(f' {lang}: {count} indexed pages')

        print('\n Top Countries:')
        for country, imps in stats['countries'].items():
            print(f' {country}: {imps} impressions')

    # 2. Sitemap status
    print('\n--- Sitemap Status ---')
    sm_status = check_sitemap_status()
    # None and an empty list both mean there is nothing to print.
    for entry in sm_status or []:
        print(f' {entry["path"]}: {entry["urls"]} URLs, {entry.get("errors",0)} errors, {entry.get("warnings",0)} warnings')

    print('\n=== Done ===')
|