#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
GSC 索引收割监控 — Google Search Console API
每天检查: 已编入索引 vs 已发现未索引比例 + 各语言AIO展现量
用法: python gsc_index_monitor.py
"""
import sys, io, os, json, datetime
# Re-wrap the underlying binary stdout stream so everything this script prints
# is encoded as UTF-8 instead of the stream's default encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
# RackNerd is US-based — no proxy needed

from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession

# Public base URL of the site; used to build the sitemap URL.
SITE = 'https://blog.quant-view.xyz'
# Search Console property identifier passed to the API as the site path segment.
SITE_URL = 'sc-domain:blog.quant-view.xyz'

# Directory containing this script; the key file lives one level above it.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Service-account JSON key used to authenticate against the API.
KEY_FILE = os.path.join(
    _SCRIPT_DIR,
    '..',
    'gothic-venture-498218-u0-15afe4efe6f3.json',
)

def get_session(scopes):
    """Return an authorized HTTP session for the given OAuth scopes.

    Credentials are loaded from the service-account key at ``KEY_FILE``.

    Args:
        scopes: OAuth scope URLs to request for the credentials.

    Returns:
        An ``AuthorizedSession`` built from those credentials.
    """
    credentials = service_account.Credentials.from_service_account_file(
        KEY_FILE, scopes=scopes
    )
    # No proxy is configured: the session connects directly.
    return AuthorizedSession(credentials)

def _fetch_search_analytics_rows(session, start_date, end_date, page_size=25000):
    """Fetch all page x country Search Analytics rows for a date window.

    Results are requested in pages of ``page_size`` rows using ``startRow``
    so that large result sets are not silently truncated.

    Args:
        session: Authorized session exposing ``post``.
        start_date: First day of the window (``datetime.date``).
        end_date: Last day of the window (``datetime.date``).
        page_size: Rows requested per call.

    Returns:
        A list of row dicts, or ``None`` if any request returns a non-200
        status (the status and the start of the body are printed).
    """
    from urllib.parse import quote

    # The property id is a path segment: percent-encode it so ':' and '/'
    # cannot be misread as URL structure.
    endpoint = (
        'https://www.googleapis.com/webmasters/v3/sites/'
        f'{quote(SITE_URL, safe="")}/searchAnalytics/query'
    )

    rows = []
    start_row = 0
    while True:
        body = {
            'startDate': start_date.isoformat(),
            'endDate': end_date.isoformat(),
            'dimensions': ['page', 'country'],
            'rowLimit': page_size,
            'startRow': start_row,
        }
        resp = session.post(
            endpoint,
            data=json.dumps(body),
            headers={'Content-Type': 'application/json'},
            timeout=30,
        )
        if resp.status_code != 200:
            print(f'Search Analytics API error: {resp.status_code}')
            print(resp.text[:500])
            return None

        batch = resp.json().get('rows', [])
        rows.extend(batch)
        # A short (or empty) page means there is nothing further to fetch.
        if len(batch) < page_size:
            return rows
        start_row += len(batch)


def _count_sitemap_urls(fallback=259):
    """Count ``<loc>`` entries with an https:// URL in the site's sitemap.xml.

    NOTE: every matching ``<loc>`` is counted; if sitemap.xml is a sitemap
    index, the entries counted are the child sitemaps rather than pages.

    Args:
        fallback: Value returned when the sitemap cannot be fetched/decoded.

    Returns:
        The number of matching entries, or ``fallback`` on fetch failure.
    """
    import http.client
    import re
    import urllib.request

    sitemap_url = f'{SITE}/sitemap.xml'
    try:
        req = urllib.request.Request(sitemap_url, headers={'User-Agent': 'GFIL-GSC/1.0'})
        with urllib.request.urlopen(req, timeout=30) as r:
            xml = r.read().decode()
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # Best effort: keep the report going, but say why the number is a guess.
        print(f'Sitemap fetch failed ({exc}); using fallback count {fallback}',
              file=sys.stderr)
        return fallback
    return len(re.findall(r'<loc>(https://[^<]+)</loc>', xml))


def check_indexing_status():
    """Pull index status: indexed pages vs. discovered-but-not-indexed.

    "Indexed" is approximated as the set of distinct pages that appear in
    Search Analytics rows (page x country) over the last 7 days; the total is
    the number of URLs listed in the site's sitemap.

    Returns:
        ``None`` if the Search Analytics request fails, otherwise a dict with:
        ``date``, ``total_sitemap_urls``, ``indexed_pages``, ``index_ratio``
        (human-readable string), ``discovered_not_indexed`` (never negative),
        ``countries`` (top 10 by impressions), ``per_language`` (page counts
        for en/zh/es/ar), ``total_impressions`` and ``total_clicks``.
    """
    session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])

    today = datetime.date.today()
    week_ago = today - datetime.timedelta(days=7)

    rows = _fetch_search_analytics_rows(session, week_ago, today)
    if rows is None:
        return None

    # Distinct pages seen in the report, and impressions summed per country.
    indexed_pages = set()
    countries = {}
    for row in rows:
        page = row['keys'][0]
        country = row['keys'][1]
        indexed_pages.add(page)
        countries[country] = countries.get(country, 0) + row.get('impressions', 0)

    total_urls = _count_sitemap_urls()
    indexed_count = len(indexed_pages)
    # An empty sitemap must not crash the report with a division by zero.
    percent = indexed_count * 100 // total_urls if total_urls else 0

    # A page is attributed to a non-English language when its URL contains
    # '/<lang>/'; everything else is counted as English.
    non_english = ('zh', 'es', 'ar')
    lang_stats = {
        'en': sum(
            1 for p in indexed_pages
            if not any(f'/{code}/' in p for code in non_english)
        )
    }
    for lang in non_english:
        lang_stats[lang] = sum(1 for p in indexed_pages if f'/{lang}/' in p)

    top_countries = sorted(countries.items(), key=lambda x: x[1], reverse=True)[:10]

    return {
        'date': today.isoformat(),
        'total_sitemap_urls': total_urls,
        'indexed_pages': indexed_count,
        'index_ratio': f'{indexed_count}/{total_urls} = {percent}%',
        # Pages with impressions may be missing from the sitemap; clamp so the
        # "not indexed" count never goes negative.
        'discovered_not_indexed': max(0, total_urls - indexed_count),
        'countries': dict(top_countries),
        'per_language': lang_stats,
        'total_impressions': sum(row.get('impressions', 0) for row in rows),
        'total_clicks': sum(row.get('clicks', 0) for row in rows),
    }

def check_sitemap_status():
    """Check sitemap submission status in GSC.

    Returns:
        ``None`` if the API request returns a non-200 status (the status and
        the start of the response body are printed), otherwise a list with one
        dict per sitemap containing ``path``, ``submitted``, ``downloaded``,
        ``urls`` (the ``submitted`` value of the first ``contents`` entry, or
        0 when there is none), ``warnings`` and ``errors``.
    """
    from urllib.parse import quote

    session = get_session(['https://www.googleapis.com/auth/webmasters.readonly'])
    # The property id is a path segment: percent-encode it so ':' and '/'
    # cannot be misread as URL structure.
    resp = session.get(
        'https://www.googleapis.com/webmasters/v3/sites/'
        f'{quote(SITE_URL, safe="")}/sitemaps',
        timeout=30,
    )
    if resp.status_code != 200:
        # Report the failure instead of returning None without explanation.
        print(f'Sitemaps API error: {resp.status_code}')
        print(resp.text[:500])
        return None

    results = []
    for s in resp.json().get('sitemap', []):
        # Missing or empty 'contents' falls back to a single empty entry,
        # which yields a URL count of 0.
        contents = s.get('contents') or [{}]
        results.append({
            'path': s.get('path', ''),
            'submitted': s.get('lastSubmitted', 'N/A'),
            'downloaded': s.get('lastDownloaded', 'N/A'),
            'urls': contents[0].get('submitted', 0),
            'warnings': s.get('warnings', 0),
            'errors': s.get('errors', 0),
        })
    return results

def main():
    """Print the index-status and sitemap-status reports for the site.

    Exits with status 0 without calling the API when the service-account key
    file does not exist.
    """
    print('=== GSC Index Harvest Monitor ===')
    print(f'Site: {SITE}')
    print(f'Time: {datetime.datetime.now().isoformat()}\n')

    if not os.path.exists(KEY_FILE):
        print('Service account key not found. Skipping GSC API.')
        sys.exit(0)

    # 1. Indexing status
    print('--- Index Status ---')
    report = check_indexing_status()
    if report:
        print(f'  Sitemap URLs:       {report["total_sitemap_urls"]}')
        print(f'  Indexed (7d imp):   {report["indexed_pages"]}')
        print(f'  Index Ratio:        {report["index_ratio"]}')
        print(f'  Discovered/NotIdx:  {report["discovered_not_indexed"]}')
        print(f'  Total Impressions:  {report["total_impressions"]}')
        print(f'  Total Clicks:       {report["total_clicks"]}')

        print('\n  Per Language:')
        for code, page_count in report['per_language'].items():
            print(f'    {code}: {page_count} indexed pages')

        print('\n  Top Countries:')
        for name, impressions in report['countries'].items():
            print(f'    {name}: {impressions} impressions')

    # 2. Sitemap status
    print('\n--- Sitemap Status ---')
    sitemap_entries = check_sitemap_status()
    # Both None and an empty list mean there is nothing to list.
    for entry in sitemap_entries or ():
        print(f'  {entry["path"]}: {entry["urls"]} URLs, {entry.get("errors",0)} errors, {entry.get("warnings",0)} warnings')

    print('\n=== Done ===')


if __name__ == '__main__':
    main()