"""Audit script for https://blog.quant-view.xyz.

For a fixed list of article and tool pages the script:

* fetches the page and reports HTTP status, size and approximate word count;
* flags any reference to an unwanted GitHub account;
* extracts every ``<a href>`` link, resolves it against the page URL and
  probes each unique, non-asset http(s) link for a broken status.

It then inspects the sitemap files, probing the first few URLs of each.
Every report line is printed and also collected in ``results``; the collected
report is written to ``OUTPUT_PATH`` at the end of the run.
"""
import re
import ssl
import time
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET
from datetime import date
from html.parser import HTMLParser
from pathlib import Path
from urllib.parse import urldefrag, urljoin, urlparse

# NOTE(security): certificate verification is disabled, so responses are not
# authenticated. This mirrors the original behaviour (the audit only reads
# public pages); drop the two overrides below to re-enable verification.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

BAD_GITHUB_PATTERN = "github.com/liudapao880807-arch"
SITE_HOST = "blog.quant-view.xyz"
OUTPUT_PATH = r"D:\GFIL_BLOG\audit_results.txt"

# Static assets are not probed as links.
ASSET_SUFFIXES = (
    ".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
    ".woff", ".woff2", ".ttf", ".eot",
)
# HEAD is rejected by some servers with these codes; retry such URLs with GET.
HEAD_UNSUPPORTED_CODES = (405, 501)
LINK_CHECK_DELAY = 0.3      # seconds between link probes on a page
SITEMAP_CHECK_DELAY = 0.2   # seconds between sitemap URL probes
SITEMAP_CHECK_LIMIT = 20    # only the first N sitemap URLs are probed
SITEMAP_SAMPLE_SIZE = 5     # number of sitemap URLs echoed into the report

# Every line passed to log(), in order; written to OUTPUT_PATH by main().
results = []


def log(msg):
    """Print *msg* and remember it in ``results`` for the report file."""
    print(msg)
    results.append(msg)


class LinkExtractor(HTMLParser):
    """HTML parser that collects the non-empty ``href`` of every ``<a>`` tag."""

    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        if tag != "a":
            return
        for key, value in attrs:
            if key == "href" and value:
                self.hrefs.append(value)


def fetch(url, timeout=30):
    """GET *url* and return ``(status, body)``.

    HTTP error responses yield their status code and decoded error body.
    Any other failure (DNS, timeout, malformed URL, ...) yields
    ``(-1, <error message>)`` so callers never have to catch exceptions.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    try:
        req = urllib.request.Request(url, headers=headers)
        # The context manager closes the connection even if decoding fails.
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode(), resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace") if exc.fp else ""
        return exc.code, body
    except Exception as exc:  # deliberate boundary: report, don't crash the audit
        return -1, str(exc)


def _probe_status(url, method, timeout):
    """Issue one *method* request to *url*; return the HTTP status or -1."""
    try:
        # Request() itself raises ValueError for malformed URLs, so it must
        # live inside the try block as well.
        req = urllib.request.Request(
            url, method=method, headers={"User-Agent": "Mozilla/5.0"}
        )
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode()
    except urllib.error.HTTPError as exc:
        return exc.code
    except Exception:  # deliberate boundary: any transport failure maps to -1
        return -1


def check_url_status(url, timeout=20):
    """Return the HTTP status of *url*, or -1 when the request itself fails.

    A HEAD request is tried first. If the server answers that HEAD is not
    supported (405/501) the URL is retried with GET so that such servers are
    not reported as broken links.
    """
    status = _probe_status(url, "HEAD", timeout)
    if status in HEAD_UNSUPPORTED_CODES:
        status = _probe_status(url, "GET", timeout)
    return status


def extract_links(html):
    """Return the ``href`` values of all ``<a>`` tags in *html*, in page order.

    Parsing is best-effort: if the parser fails part-way, a warning is logged
    and the links gathered up to that point are returned.
    """
    parser = LinkExtractor()
    try:
        parser.feed(html)
    except Exception as exc:  # best-effort: keep whatever was parsed
        log(f" *** WARNING: HTML parsing stopped early: {exc}")
    return parser.hrefs


def resolve_link(base_url, href):
    """Resolve *href* (as found on the page *base_url*) to an absolute URL.

    Absolute http(s) URLs are returned unchanged and protocol-relative URLs
    (``//host/path``) are forced to https. Everything else, both
    root-relative (``/x``) and document-relative (``x.html``, ``../x``), is
    joined against *base_url*. Non-hierarchical schemes such as ``mailto:``
    and ``javascript:`` pass through ``urljoin`` untouched.
    """
    href = href.strip()
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        return "https:" + href
    try:
        return urljoin(base_url, href)
    except ValueError:
        # Malformed href (e.g. a broken IPv6 literal): hand it back as-is;
        # callers ignore anything that is not an http(s) URL.
        return href


def _is_internal(link):
    """True when *link*'s hostname is exactly the audited site."""
    try:
        return urlparse(link).hostname == SITE_HOST
    except ValueError:
        return False


def _is_asset(link):
    """True when *link*'s path (ignoring query/fragment) names a static asset."""
    try:
        path = urlparse(link).path
    except ValueError:
        return False
    return path.lower().endswith(ASSET_SUFFIXES)


def _find_broken_links(links):
    """Probe each link once; return ``(checked, ok, broken, detail_lines)``.

    *links* must already be de-duplicated. Static assets and non-http(s)
    URLs are skipped and do not count as checked.
    """
    checked = 0
    ok_count = 0
    broken = []
    details = []
    for link in links:
        if not link.startswith(("http://", "https://")) or _is_asset(link):
            continue
        time.sleep(LINK_CHECK_DELAY)  # be polite to the target servers
        status = check_url_status(link)
        checked += 1
        if status == -1:
            broken.append((link, "CONNECTION_ERROR"))
            details.append(f" BROKEN (connection error): {link}")
        elif status >= 400:
            broken.append((link, status))
            details.append(f" BROKEN ({status}): {link}")
        else:
            ok_count += 1
    return checked, ok_count, broken, details


def check_page(url, label):
    """Audit one page and return its broken links.

    Logs status, size, approximate word count, occurrences of the unwanted
    GitHub account and link statistics.

    Returns a list of ``(link, status)`` tuples, where ``status`` is the HTTP
    status code or the string ``"CONNECTION_ERROR"``. The list is empty when
    the page itself could not be fetched with status 200.
    """
    log("\n" + "=" * 80)
    log(f"[{label}] Checking: {url}")
    code, body = fetch(url)
    log(f" Status: {code}")
    log(f" Size: {len(body)} chars")
    if code != 200:
        log(" *** ERROR: Non-200 status code!")
        return []

    # Crude tag strip: good enough to spot blank or broken pages.
    text_only = re.sub(r'<[^>]+>', '', body).strip()
    word_count = len(text_only.split())
    log(f" Word count (approx): {word_count}")
    if word_count < 50:
        log(" *** WARNING: Very low content — possibly blank or broken page!")

    # Plain substring count: the pattern is a literal, not a regex.
    bad_hits = body.count(BAD_GITHUB_PATTERN)
    if bad_hits:
        log(f" *** BAD LINKS FOUND: github.com/liudapao880807-arch appears {bad_hits} time(s)!")
        for match in re.finditer(r'href="[^"]*liudapao880807-arch[^"]*"', body):
            log(f" -> {match.group()}")
    else:
        log(" No github.com/liudapao880807-arch links (good)")

    hrefs = extract_links(body)
    log(f" Total links found: {len(hrefs)}")

    internal = []
    external = []
    for href in hrefs:
        resolved = resolve_link(url, href)
        if _is_internal(resolved):
            internal.append(resolved)
        elif resolved.startswith("http"):
            external.append(resolved)
    log(f" Internal links: {len(internal)}")
    log(f" External links: {len(external)}")

    # Drop fragments (same resource) and de-duplicate while keeping page
    # order, so the report is deterministic from run to run.
    unique_links = list(dict.fromkeys(
        urldefrag(link)[0] for link in internal + external
    ))
    checked, ok_count, broken, details = _find_broken_links(unique_links)

    log(f" Checked {checked} unique non-asset links")
    log(f" OK: {ok_count}")
    log(f" Broken/Errors: {len(broken)}")
    for line in details:
        log(line)
    return broken


def _sitemap_locs(body):
    """Return the text of every non-empty ``<loc>`` element in a sitemap.

    If *body* is not well-formed XML, fall back to a regex scan for http(s)
    URLs in the raw text.
    """
    try:
        root = ET.fromstring(body)
    except ET.ParseError:
        found = re.findall(r'\s*(https?://[^<]+)\s*', body)
        # The greedy capture may include trailing whitespace.
        return [candidate.strip() for candidate in found]
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    locs = []
    for loc in root.findall(".//sm:loc", ns):
        text = (loc.text or "").strip()  # an empty <loc/> has text None
        if text:
            locs.append(text)
    return locs


def check_sitemap(url, label):
    """Fetch a sitemap, log how many URLs it lists and probe the first few.

    Only the first ``SITEMAP_CHECK_LIMIT`` URLs are probed. Returns ``None``;
    all findings are reported through ``log``.
    """
    log("\n" + "=" * 80)
    log(f"[{label}] Checking sitemap: {url}")
    code, body = fetch(url)
    log(f" Status: {code}")
    if code != 200:
        log(" *** ERROR: Sitemap not accessible!")
        return

    urls_in_sitemap = _sitemap_locs(body)
    log(f" URLs in sitemap: {len(urls_in_sitemap)}")
    if not urls_in_sitemap:
        log(" *** WARNING: No URLs found in sitemap!")
        return

    for sample in urls_in_sitemap[:SITEMAP_SAMPLE_SIZE]:
        log(f" Sample: {sample}")

    broken_count = 0
    for entry in urls_in_sitemap[:SITEMAP_CHECK_LIMIT]:
        time.sleep(SITEMAP_CHECK_DELAY)
        status = check_url_status(entry)
        if status >= 400 or status == -1:
            log(f" BROKEN in sitemap ({status}): {entry}")
            broken_count += 1
    if broken_count == 0:
        log(f" First {SITEMAP_CHECK_LIMIT} sitemap URLs all OK")
    else:
        log(f" {broken_count} broken URLs in first {SITEMAP_CHECK_LIMIT} sitemap entries")


article_urls = [
    ("Article", "https://blog.quant-view.xyz/position-size-calculator-guide.html"),
    ("Article", "https://blog.quant-view.xyz/gold-trading-2026-guide.html"),
    ("Article", "https://blog.quant-view.xyz/ssh-tunnel-deployment-china.html"),
    ("Article", "https://blog.quant-view.xyz/github-seo-trading-tools.html"),
    ("Article", "https://blog.quant-view.xyz/gold-pip-value-calculator-wrong.html"),
    ("Article", "https://blog.quant-view.xyz/bloomberg-alternative.html"),
    ("Article", "https://blog.quant-view.xyz/why-retail-traders-lose-money.html"),
    ("Article", "https://blog.quant-view.xyz/order-flow-trading.html"),
]

tool_urls = [
    ("Tool", "https://blog.quant-view.xyz/tools/position-size-formula.html"),
    ("Tool", "https://blog.quant-view.xyz/tools/xauusd-trading-guide.html"),
    ("Tool", "https://blog.quant-view.xyz/tools/btc-position-size-calculator.html"),
    ("Tool", "https://blog.quant-view.xyz/tools/risk-management-guide.html"),
    ("Tool", "https://blog.quant-view.xyz/tools/tradingview-vs-mt5.html"),
    ("Tool", "https://blog.quant-view.xyz/tools/kelly-criterion-formula.html"),
    ("Tool", "https://blog.quant-view.xyz/tools/forex-position-size-calculator.html"),
    ("Tool", "https://blog.quant-view.xyz/tools/pip-calculator-eurgbp.html"),
    ("Tool", "https://blog.quant-view.xyz/tools/sp500-position-size-calculator.html"),
    ("Tool", "https://blog.quant-view.xyz/tools/position-size-calculator-100000-dollar-account.html"),
]

sitemap_urls = [
    ("Sitemap", "https://blog.quant-view.xyz/sitemap.xml"),
    ("Sitemap", "https://blog.quant-view.xyz/sitemap-tools.xml"),
    ("Sitemap", "https://blog.quant-view.xyz/sitemap-posts.xml"),
]


def main():
    """Run the full audit and write the collected report to ``OUTPUT_PATH``."""
    log("=" * 80)
    log("BLOG AUDIT: https://blog.quant-view.xyz")
    log(f"Date: {date.today().isoformat()}")
    log("=" * 80)

    all_broken = []
    for label, url in article_urls + tool_urls:
        all_broken.extend(check_page(url, label))

    for label, url in sitemap_urls:
        check_sitemap(url, label)

    log("\n" + "=" * 80)
    log("SUMMARY")
    log("=" * 80)
    log(f"\nTotal broken links found: {len(all_broken)}")
    for link, status in all_broken:
        log(f" [{status}] {link}")

    # Create the target directory first so a long audit is not lost to a
    # missing folder at the very last step.
    Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_PATH, "w", encoding="utf-8") as report:
        report.write("\n".join(results))
    log(f"\nResults written to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()