gfil-blog/deploy_scripts/gsc_index_monitor.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
GSC 索引收割监控 — Google Search Console API
每天检查: 已编入索引 vs 已发现未索引比例 + 各语言AIO展现量
用法: python gsc_index_monitor.py
"""
import sys, io, os, json, datetime
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# RackNerd is US-based — no proxy needed

from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession

# Public origin of the blog; the sitemap URL is derived from it.
SITE = 'https://blog.quant-view.xyz'
# Search Console property identifier interpolated into the API request paths.
SITE_URL = 'sc-domain:blog.quant-view.xyz'
# Service-account key file, looked up one directory above this script.
KEY_FILE = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    os.pardir,
    'gothic-venture-498218-u0-15afe4efe6f3.json',
)

def get_session(scopes):
    """Return an authorized HTTP session for the given OAuth scopes.

    Args:
        scopes: List of OAuth scope URLs requested for the credentials.

    Returns:
        An ``AuthorizedSession`` backed by service-account credentials
        loaded from ``KEY_FILE``.
    """
    # No proxy is configured on the session (the original note states the
    # RackNerd US server has direct internet access).
    return AuthorizedSession(
        service_account.Credentials.from_service_account_file(KEY_FILE, scopes=scopes)
    )

def _count_sitemap_urls(sitemap_url, fallback=259):
    """Return the number of https ``<loc>`` entries found at *sitemap_url*.

    Best effort: when the sitemap cannot be fetched or decoded, a warning is
    printed and *fallback* is returned instead of raising.  The default of 259
    is the hard-coded fallback this script has always used (presumably the
    sitemap size when it was written — confirm before relying on it).
    """
    import re
    import urllib.request

    try:
        req = urllib.request.Request(sitemap_url, headers={'User-Agent': 'GFIL-GSC/1.0'})
        with urllib.request.urlopen(req, timeout=30) as r:
            xml = r.read().decode()
    except Exception as exc:
        # Network, HTTP and decoding failures are all non-fatal for a monitor,
        # but unlike a bare ``except:`` this lets Ctrl-C / SystemExit through
        # and tells the operator why the fallback was used.
        print(f'Sitemap fetch failed ({exc}); using fallback count {fallback}')
        return fallback
    return len(re.findall(r'<loc>(https://[^<]+)</loc>', xml))


def _pages_per_language(pages):
    """Count *pages* per language using the ``/<lang>/`` path segment.

    A page without a ``/zh/``, ``/es/`` or ``/ar/`` segment counts as English.
    """
    markers = {lang: f'/{lang}/' for lang in ('zh', 'es', 'ar')}
    counts = {
        'en': sum(1 for p in pages if not any(m in p for m in markers.values())),
    }
    for lang, marker in markers.items():
        counts[lang] = sum(1 for p in pages if marker in p)
    return counts


def check_indexing_status():
    """Estimate index coverage from the last 7 days of Search Analytics data.

    Queries the Search Console Search Analytics endpoint grouped by page and
    country, treats every page present in the response as indexed, and
    compares that count with the number of ``<loc>`` entries in the sitemap.

    Returns:
        A dict with the keys ``date``, ``total_sitemap_urls``,
        ``indexed_pages``, ``index_ratio``, ``discovered_not_indexed``,
        ``countries`` (top 10 by impressions), ``per_language``,
        ``total_impressions`` and ``total_clicks``; or ``None`` when the API
        answers with a non-200 status (the status and the first 500 characters
        of the body are printed).
    """
    session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])

    today = datetime.date.today()
    week_ago = today - datetime.timedelta(days=7)

    # One row is returned per (page, country) pair, so a small fixed rowLimit
    # silently truncates the data.  Page through the full result set instead,
    # using the API's documented maximum page size.
    page_size = 25000
    rows = []
    start_row = 0
    while True:
        body = {
            'startDate': week_ago.isoformat(),
            'endDate': today.isoformat(),
            'dimensions': ['page', 'country'],
            'rowLimit': page_size,
            'startRow': start_row,
        }
        resp = session.post(
            f'https://www.googleapis.com/webmasters/v3/sites/{SITE_URL}/searchAnalytics/query',
            data=json.dumps(body),
            headers={'Content-Type': 'application/json'},
            timeout=30
        )
        if resp.status_code != 200:
            print(f'Search Analytics API error: {resp.status_code}')
            print(resp.text[:500])
            return None

        batch = resp.json().get('rows', [])
        rows.extend(batch)
        if len(batch) < page_size:
            break  # a short (or empty) page means there is nothing left
        start_row += len(batch)

    # Unique pages seen in the report, and impressions summed per country.
    indexed_pages = set()
    countries = {}
    for row in rows:
        page, country = row['keys'][0], row['keys'][1]
        indexed_pages.add(page)
        countries[country] = countries.get(country, 0) + row.get('impressions', 0)

    total_urls = _count_sitemap_urls(f'{SITE}/sitemap.xml')
    indexed_count = len(indexed_pages)

    # An empty sitemap must not crash the report with ZeroDivisionError.
    if total_urls:
        percent = f'{indexed_count * 100 // total_urls}%'
    else:
        percent = 'n/a'

    top_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:10]

    return {
        'date': today.isoformat(),
        'total_sitemap_urls': total_urls,
        'indexed_pages': indexed_count,
        'index_ratio': f'{indexed_count}/{total_urls} = {percent}',
        # Pages outside the sitemap can also get impressions; never report a
        # negative remainder.
        'discovered_not_indexed': max(total_urls - indexed_count, 0),
        'countries': dict(top_countries),
        'per_language': _pages_per_language(indexed_pages),
        'total_impressions': sum(row.get('impressions', 0) for row in rows),
        'total_clicks': sum(row.get('clicks', 0) for row in rows),
    }

def check_sitemap_status():
    """Fetch the sitemaps registered for the property in Search Console.

    Returns:
        A list with one dict per sitemap (keys ``path``, ``submitted``,
        ``downloaded``, ``urls``, ``warnings``, ``errors``), or ``None`` when
        the API answers with a non-200 status.  ``urls`` is the ``submitted``
        value of the first ``contents`` entry, or 0 when there is none.
    """
    session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])
    resp = session.get(
        f'https://www.googleapis.com/webmasters/v3/sites/{SITE_URL}/sitemaps',
        timeout=30
    )
    if resp.status_code != 200:
        # Report the failure the same way check_indexing_status does instead
        # of returning None without any diagnostics.
        print(f'Sitemaps API error: {resp.status_code}')
        print(resp.text[:500])
        return None

    results = []
    for s in resp.json().get('sitemap', []):
        # Missing or empty ``contents`` falls back to one empty entry -> 0 URLs.
        contents = s.get('contents') or [{}]
        results.append({
            'path': s.get('path', ''),
            'submitted': s.get('lastSubmitted', 'N/A'),
            'downloaded': s.get('lastDownloaded', 'N/A'),
            'urls': contents[0].get('submitted', 0),
            'warnings': s.get('warnings', 0),
            'errors': s.get('errors', 0),
        })
    return results

if __name__ == '__main__':
    # Banner with the monitored site and the local run time.
    print('=== GSC Index Harvest Monitor ===')
    print(f'Site: {SITE}')
    print(f'Time: {datetime.datetime.now().isoformat()}\n')

    # Without the service-account key no API call can be made; exit cleanly.
    if not os.path.exists(KEY_FILE):
        print('Service account key not found. Skipping GSC API.')
        sys.exit(0)

    # 1. Indexing status
    print('--- Index Status ---')
    stats = check_indexing_status()
    if stats:
        # (label, stats key) pairs, printed in this order.
        summary_lines = (
            ('  Sitemap URLs:       ', 'total_sitemap_urls'),
            ('  Indexed (7d imp):   ', 'indexed_pages'),
            ('  Index Ratio:        ', 'index_ratio'),
            ('  Discovered/NotIdx:  ', 'discovered_not_indexed'),
            ('  Total Impressions:  ', 'total_impressions'),
            ('  Total Clicks:       ', 'total_clicks'),
        )
        for label, key in summary_lines:
            print(f'{label}{stats[key]}')

        print('\n  Per Language:')
        for lang, count in stats['per_language'].items():
            print(f'    {lang}: {count} indexed pages')

        print('\n  Top Countries:')
        for country, imps in stats['countries'].items():
            print(f'    {country}: {imps} impressions')

    # 2. Sitemap status
    print('\n--- Sitemap Status ---')
    sm_status = check_sitemap_status()
    for entry in sm_status or []:
        path = entry["path"]
        url_count = entry["urls"]
        error_count = entry.get("errors", 0)
        warning_count = entry.get("warnings", 0)
        print(f'  {path}: {url_count} URLs, {error_count} errors, {warning_count} warnings')

    print('\n=== Done ===')
Update README 2026-06-28 17:19:47 +00:00			`#!/usr/bin/env python`
			`# -*- coding: utf-8 -*-`
			`"""`
			`GSC 索引收割监控 — Google Search Console API`
			`每天检查: 已编入索引 vs 已发现未索引比例 + 各语言AIO展现量`
			`用法: python gsc_index_monitor.py`
			`"""`
			`import sys, io, os, json, datetime`
			`sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')`
			`# RackNerd is US-based — no proxy needed`

			`from google.oauth2 import service_account`
			`from google.auth.transport.requests import AuthorizedSession`

			`SITE = 'https://blog.quant-view.xyz'`
			`SITE_URL = 'sc-domain:blog.quant-view.xyz'`
			`KEY_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..',`
			`'gothic-venture-498218-u0-15afe4efe6f3.json')`

			`def get_session(scopes):`
			`creds = service_account.Credentials.from_service_account_file(KEY_FILE, scopes=scopes)`
			`s = AuthorizedSession(creds)`
			`# No proxy — RackNerd US server has direct internet access`
			`return s`

			`def check_indexing_status():`
			`"""拉取索引状态: 已编入索引 / 已发现未索引"""`
			`# Use Search Console URL Inspection API`
			`# For aggregate: use Webmasters v3 Search Analytics`
			`session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])`

			`today = datetime.date.today()`
			`week_ago = today - datetime.timedelta(days=7)`

			`# Search Analytics: get indexed page count by checking impressions > 0`
			`body = {`
			`'startDate': week_ago.isoformat(),`
			`'endDate': today.isoformat(),`
			`'dimensions': ['page', 'country'],`
			`'rowLimit': 500,`
			`}`

			`resp = session.post(`
			`f'https://www.googleapis.com/webmasters/v3/sites/{SITE_URL}/searchAnalytics/query',`
			`data=json.dumps(body),`
			`headers={'Content-Type': 'application/json'},`
			`timeout=30`
			`)`

			`if resp.status_code != 200:`
			`print(f'Search Analytics API error: {resp.status_code}')`
			`print(resp.text[:500])`
			`return None`

			`data = resp.json()`
			`rows = data.get('rows', [])`

			`# Count unique pages that got impressions`
			`indexed_pages = set()`
			`countries = {}`
			`for row in rows:`
			`page = row['keys'][0]`
			`country = row['keys'][1]`
			`indexed_pages.add(page)`
			`countries[country] = countries.get(country, 0) + row.get('impressions', 0)`

			`# Also get total pages from sitemap`
			`import urllib.request, re`
			`sitemap_url = f'{SITE}/sitemap.xml'`
			`try:`
			`req = urllib.request.Request(sitemap_url, headers={'User-Agent': 'GFIL-GSC/1.0'})`
			`with urllib.request.urlopen(req, timeout=30) as r:`
			`xml = r.read().decode()`
			`total_urls = len(re.findall(r'<loc>(https://[^<]+)</loc>', xml))`
			`except:`
			`total_urls = 259 # Fallback`

			`# Also pull per-language stats`
			`lang_stats = {}`
			`for lang in ['en', 'zh', 'es', 'ar']:`
			`if lang == 'en':`
			`lang_pages = [p for p in indexed_pages if '/zh/' not in p and '/es/' not in p and '/ar/' not in p]`
			`else:`
			`prefix = f'{SITE}/tools/{lang}/'`
			`lang_pages = [p for p in indexed_pages if prefix in p or f'/{lang}/' in p]`
			`lang_stats[lang] = len(lang_pages)`

			`return {`
			`'date': today.isoformat(),`
			`'total_sitemap_urls': total_urls,`
			`'indexed_pages': len(indexed_pages),`
			`'index_ratio': f'{len(indexed_pages)}/{total_urls} = {len(indexed_pages)*100//total_urls}%',`
			`'discovered_not_indexed': total_urls - len(indexed_pages),`
			`'countries': dict(sorted(countries.items(), key=lambda x: x[1], reverse=True)[:10]),`
			`'per_language': lang_stats,`
			`'total_impressions': sum(row.get('impressions', 0) for row in rows),`
			`'total_clicks': sum(row.get('clicks', 0) for row in rows),`
			`}`

			`def check_sitemap_status():`
			`"""Check sitemap submission status in GSC"""`
			`session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])`
			`resp = session.get(`
			`f'https://www.googleapis.com/webmasters/v3/sites/{SITE_URL}/sitemaps',`
			`timeout=30`
			`)`
			`if resp.status_code == 200:`
			`data = resp.json()`
			`sitemaps = data.get('sitemap', [])`
			`results = []`
			`for s in sitemaps:`
			`results.append({`
			`'path': s.get('path', ''),`
			`'submitted': s.get('lastSubmitted', 'N/A'),`
			`'downloaded': s.get('lastDownloaded', 'N/A'),`
			`'urls': s.get('contents', [{}])[0].get('submitted', 0) if s.get('contents') else 0,`
			`'warnings': s.get('warnings', 0),`
			`'errors': s.get('errors', 0),`
			`})`
			`return results`
			`return None`

			`if __name__ == '__main__':`
			`print(f'=== GSC Index Harvest Monitor ===')`
			`print(f'Site: {SITE}')`
			`print(f'Time: {datetime.datetime.now().isoformat()}\n')`

			`if not os.path.exists(KEY_FILE):`
			`print('Service account key not found. Skipping GSC API.')`
			`sys.exit(0)`

			`# 1. Indexing status`
			`print('--- Index Status ---')`
			`stats = check_indexing_status()`
			`if stats:`
			`print(f' Sitemap URLs: {stats["total_sitemap_urls"]}')`
			`print(f' Indexed (7d imp): {stats["indexed_pages"]}')`
			`print(f' Index Ratio: {stats["index_ratio"]}')`
			`print(f' Discovered/NotIdx: {stats["discovered_not_indexed"]}')`
			`print(f' Total Impressions: {stats["total_impressions"]}')`
			`print(f' Total Clicks: {stats["total_clicks"]}')`
			`print(f'\n Per Language:')`
			`for lang, count in stats['per_language'].items():`
			`print(f' {lang}: {count} indexed pages')`
			`print(f'\n Top Countries:')`
			`for country, imps in stats['countries'].items():`
			`print(f' {country}: {imps} impressions')`

			`# 2. Sitemap status`
			`print(f'\n--- Sitemap Status ---')`
			`sm_status = check_sitemap_status()`
			`if sm_status:`
			`for s in sm_status:`
			`print(f' {s["path"]}: {s["urls"]} URLs, {s.get("errors",0)} errors, {s.get("warnings",0)} warnings')`

			`print(f'\n=== Done ===')`