#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""GSC index harvest monitor — Google Search Console API.

Daily check: how many pages earned Search impressions during the last
7 days (used by this script as its "indexed" signal) compared with the
number of URLs listed in the sitemap, plus per-language page counts and
per-country impressions.

Usage: python gsc_index_monitor.py
"""
import datetime
import http.client
import io
import json
import os
import re
import sys
import urllib.parse
import urllib.request
from operator import itemgetter


def _force_utf8_stdout():
    """Make ``sys.stdout`` encode as UTF-8 regardless of the locale.

    ``reconfigure`` changes the encoding of the existing stream in place
    (keeping its buffering mode).  Re-wrapping ``sys.stdout.buffer`` is
    only used as a fallback, and streams exposing neither are left
    untouched instead of raising ``AttributeError``.
    """
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if reconfigure is not None:
        reconfigure(encoding='utf-8')
    elif hasattr(sys.stdout, 'buffer'):
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


_force_utf8_stdout()

# RackNerd is US-based — no proxy needed

SITE = 'https://blog.quant-view.xyz'
SITE_URL = 'sc-domain:blog.quant-view.xyz'
KEY_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..',
                        'gothic-venture-498218-u0-15afe4efe6f3.json')

API_ROOT = 'https://www.googleapis.com/webmasters/v3'
READONLY_SCOPES = ['https://www.googleapis.com/auth/webmasters.readonly']
REQUEST_TIMEOUT = 30  # seconds, applied to every HTTP call
# Language buckets reported under 'per_language'; 'en' is "none of the others".
LANGUAGES = ('en', 'zh', 'es', 'ar')
# Largest page size the Search Analytics query accepts (see API reference).
ROW_LIMIT = 25000
# Hard ceiling on rows pulled per run so pagination always terminates.
MAX_ROWS = 250000
# Sitemap size assumed when the live sitemap cannot be downloaded.
FALLBACK_SITEMAP_URL_COUNT = 259

# One match per <loc> entry (optionally CDATA-wrapped).  Anchoring on <loc>
# keeps alternate-language hrefs and other embedded URLs out of the count.
_SITEMAP_LOC_RE = re.compile(r'<loc>\s*(?:<!\[CDATA\[\s*)?https?://',
                             re.IGNORECASE)


def get_session(scopes):
    """Return an ``AuthorizedSession`` for the service account in KEY_FILE.

    Args:
        scopes: list of OAuth scope URLs requested for the credentials.
    """
    # Imported here rather than at module level so the "key not found" skip
    # path and the pure helpers below work without google-auth installed.
    from google.oauth2 import service_account
    from google.auth.transport.requests import AuthorizedSession

    creds = service_account.Credentials.from_service_account_file(
        KEY_FILE, scopes=scopes)
    # No proxy — RackNerd US server has direct internet access
    return AuthorizedSession(creds)


def _site_api_url(suffix):
    """Build a Webmasters v3 URL for SITE_URL ending in *suffix*."""
    # The site identifier contains ':' (and '/' for URL-prefix properties),
    # so it has to be percent-encoded before being used as a path segment.
    site_id = urllib.parse.quote(SITE_URL, safe='')
    return f'{API_ROOT}/sites/{site_id}/{suffix}'


def _print_api_error(label, resp):
    """Print the status code and the first 500 characters of a failed reply."""
    print(f'{label} API error: {resp.status_code}')
    print(resp.text[:500])


def count_sitemap_urls(xml):
    """Return the number of ``<loc>`` entries found in sitemap text *xml*."""
    return len(_SITEMAP_LOC_RE.findall(xml))


def count_pages_by_language(pages):
    """Bucket page URLs by language path segment.

    A page counts for 'zh'/'es'/'ar' when its URL contains '/<lang>/';
    it counts for 'en' when it contains none of those segments.

    Returns:
        dict mapping each entry of LANGUAGES to a page count.
    """
    pages = list(pages)
    non_english = [f'/{lang}/' for lang in LANGUAGES if lang != 'en']
    stats = {}
    for lang in LANGUAGES:
        if lang == 'en':
            stats[lang] = sum(
                1 for page in pages
                if not any(marker in page for marker in non_english))
        else:
            marker = f'/{lang}/'
            stats[lang] = sum(1 for page in pages if marker in page)
    return stats


def format_index_ratio(indexed, total):
    """Format ``indexed/total = NN%``; prints ``n/a`` when *total* is 0."""
    if total <= 0:
        # An empty sitemap must not crash the report with ZeroDivisionError.
        return f'{indexed}/{total} = n/a'
    return f'{indexed}/{total} = {indexed * 100 // total}%'


def build_index_report(rows, total_urls, report_date):
    """Aggregate Search Analytics rows into the monitor's report dict.

    Args:
        rows: row dicts whose 'keys' entry is ``[page, country]`` and which
            may carry 'impressions' and 'clicks'.
        total_urls: number of URLs listed in the sitemap.
        report_date: ``datetime.date`` stamped on the report.

    Returns:
        dict with the keys 'date', 'total_sitemap_urls', 'indexed_pages',
        'index_ratio', 'discovered_not_indexed', 'countries' (top 10 by
        impressions), 'per_language', 'total_impressions', 'total_clicks'.
    """
    indexed_pages = set()
    countries = {}
    total_impressions = 0
    total_clicks = 0
    for row in rows:
        page, country = row['keys'][0], row['keys'][1]
        impressions = row.get('impressions', 0)
        indexed_pages.add(page)
        countries[country] = countries.get(country, 0) + impressions
        total_impressions += impressions
        total_clicks += row.get('clicks', 0)

    top_countries = sorted(countries.items(), key=itemgetter(1),
                           reverse=True)[:10]
    indexed = len(indexed_pages)
    return {
        'date': report_date.isoformat(),
        'total_sitemap_urls': total_urls,
        'indexed_pages': indexed,
        'index_ratio': format_index_ratio(indexed, total_urls),
        'discovered_not_indexed': total_urls - indexed,
        'countries': dict(top_countries),
        'per_language': count_pages_by_language(indexed_pages),
        'total_impressions': total_impressions,
        'total_clicks': total_clicks,
    }


def _fetch_search_analytics_rows(session, start_date, end_date):
    """Download every page x country row for the date range.

    Returns:
        list of row dicts, or None when any request fails (the error is
        printed first).
    """
    url = _site_api_url('searchAnalytics/query')
    rows = []
    # Rows are page x country combinations, so one small page of results
    # would silently drop pages; keep requesting until a short batch arrives.
    for start_row in range(0, MAX_ROWS, ROW_LIMIT):
        body = {
            'startDate': start_date.isoformat(),
            'endDate': end_date.isoformat(),
            'dimensions': ['page', 'country'],
            'rowLimit': ROW_LIMIT,
            'startRow': start_row,
        }
        resp = session.post(
            url,
            data=json.dumps(body),
            headers={'Content-Type': 'application/json'},
            timeout=REQUEST_TIMEOUT,
        )
        if resp.status_code != 200:
            _print_api_error('Search Analytics', resp)
            return None
        batch = resp.json().get('rows', [])
        rows.extend(batch)
        if len(batch) < ROW_LIMIT:
            break
    return rows


def _fetch_sitemap_url_count():
    """Download SITE's sitemap.xml and return how many URLs it lists.

    Falls back to FALLBACK_SITEMAP_URL_COUNT (with a warning on stderr)
    when the sitemap cannot be downloaded or decoded.
    """
    sitemap_url = f'{SITE}/sitemap.xml'
    req = urllib.request.Request(sitemap_url,
                                 headers={'User-Agent': 'GFIL-GSC/1.0'})
    try:
        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as reply:
            xml = reply.read().decode()
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # Network, HTTP and decoding failures only; Ctrl-C still propagates.
        print(f'Sitemap fetch failed ({exc}); assuming '
              f'{FALLBACK_SITEMAP_URL_COUNT} URLs', file=sys.stderr)
        return FALLBACK_SITEMAP_URL_COUNT
    return count_sitemap_urls(xml)


def check_indexing_status():
    """Fetch index status: pages with impressions vs. sitemap URLs.

    Pages that received impressions in the last 7 days are treated as
    indexed and compared with the number of URLs in the sitemap.

    Returns:
        The report dict described in ``build_index_report``, or None when
        the Search Analytics request fails.
    """
    session = get_session(READONLY_SCOPES)
    today = datetime.date.today()
    week_ago = today - datetime.timedelta(days=7)

    rows = _fetch_search_analytics_rows(session, week_ago, today)
    if rows is None:
        return None
    return build_index_report(rows, _fetch_sitemap_url_count(), today)


def check_sitemap_status():
    """Check sitemap submission status in GSC.

    Returns:
        list of dicts with 'path', 'submitted', 'downloaded', 'urls',
        'warnings' and 'errors' per submitted sitemap, or None when the
        API request fails (the error is printed first).
    """
    session = get_session(READONLY_SCOPES)
    resp = session.get(_site_api_url('sitemaps'), timeout=REQUEST_TIMEOUT)
    if resp.status_code != 200:
        _print_api_error('Sitemaps', resp)
        return None

    results = []
    for entry in resp.json().get('sitemap', []):
        contents = entry.get('contents')
        results.append({
            'path': entry.get('path', ''),
            'submitted': entry.get('lastSubmitted', 'N/A'),
            'downloaded': entry.get('lastDownloaded', 'N/A'),
            # Only the first content entry is reported.
            'urls': contents[0].get('submitted', 0) if contents else 0,
            'warnings': entry.get('warnings', 0),
            'errors': entry.get('errors', 0),
        })
    return results


def main():
    """Print the index and sitemap report for SITE to stdout."""
    print('=== GSC Index Harvest Monitor ===')
    print(f'Site: {SITE}')
    print(f'Time: {datetime.datetime.now().isoformat()}\n')

    if not os.path.exists(KEY_FILE):
        print('Service account key not found. Skipping GSC API.')
        sys.exit(0)

    # 1. Indexing status
    print('--- Index Status ---')
    stats = check_indexing_status()
    if stats:
        print(f' Sitemap URLs: {stats["total_sitemap_urls"]}')
        print(f' Indexed (7d imp): {stats["indexed_pages"]}')
        print(f' Index Ratio: {stats["index_ratio"]}')
        print(f' Discovered/NotIdx: {stats["discovered_not_indexed"]}')
        print(f' Total Impressions: {stats["total_impressions"]}')
        print(f' Total Clicks: {stats["total_clicks"]}')
        print('\n Per Language:')
        for lang, count in stats['per_language'].items():
            print(f' {lang}: {count} indexed pages')
        print('\n Top Countries:')
        for country, imps in stats['countries'].items():
            print(f' {country}: {imps} impressions')

    # 2. Sitemap status
    print('\n--- Sitemap Status ---')
    sm_status = check_sitemap_status()
    if sm_status:
        for s in sm_status:
            print(f' {s["path"]}: {s["urls"]} URLs, '
                  f'{s.get("errors",0)} errors, '
                  f'{s.get("warnings",0)} warnings')

    print('\n=== Done ===')


if __name__ == '__main__':
    main()