# File: gfil-blog/audit.py
#
# 246 lines, 8.1 KiB, Python

import urllib.request
import urllib.error
import re
import ssl
import time
import xml.etree.ElementTree as ET
from html.parser import HTMLParser
# Literal text of the GitHub account URL that audited pages must not contain.
BAD_GITHUB_PATTERN = "github.com/liudapao880807-arch"

# Every message passed to log() is accumulated here so the whole report can
# be written to disk at the end of the run.
results = []

# Shared TLS context for all requests, with certificate validation switched
# off. check_hostname must be cleared before verify_mode is set: the ssl
# module refuses CERT_NONE while hostname checking is still enabled.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
def log(msg):
    """Print *msg* and remember it in the module-level ``results`` list.

    The stored copy is what later gets joined into the on-disk report.
    """
    print(msg)
    results.append(msg)
class LinkExtractor(HTMLParser):
    """Collect the ``href`` of every ``<a>`` start tag fed to the parser.

    After ``feed()``, ``hrefs`` holds the link targets in document order.
    Values are stored with surrounding whitespace removed; anchors whose
    href is missing, empty, or whitespace-only are skipped.
    """

    def __init__(self):
        super().__init__()
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        """Record the href attribute(s) of an anchor tag; ignore other tags."""
        if tag != "a":
            return
        for name, value in attrs:
            if name != "href" or not value:
                continue
            # HTML allows whitespace around the URL inside the attribute.
            # Storing it verbatim would make later prefix checks such as
            # startswith("/") or startswith("http") miss the link.
            href = value.strip()
            if href:
                self.hrefs.append(href)
def fetch(url, timeout=30):
    """GET *url* and return ``(status_code, body_text)``.

    The body is decoded as UTF-8 with undecodable bytes replaced. The
    request goes through the module-level ``ctx`` TLS context.

    Returns:
        ``(code, body)`` for a successful response;
        ``(e.code, error_body_or_"")`` when the server answers with an HTTP
        error status;
        ``(-1, message)`` for any other failure (DNS, timeout, malformed
        URL, ...), where *message* is the exception text.
    """
    try:
        # Built inside the try so a malformed URL yields the -1 sentinel
        # instead of raising out of the function.
        req = urllib.request.Request(
            url,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        )
        # The context manager closes the socket once the body has been read.
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode(), resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        if not e.fp:
            return e.code, ""
        try:
            return e.code, e.read().decode("utf-8", errors="replace")
        finally:
            # An HTTPError wraps the open response; release it explicitly.
            e.close()
    except Exception as e:
        return -1, str(e)
def check_url_status(url, timeout=20):
    """Return the HTTP status code for *url*, or ``-1`` if it is unreachable.

    A HEAD request is tried first. Servers that answer HEAD with 405 (Method
    Not Allowed) or 501 (Not Implemented) are retried with GET, because that
    response says the method is unsupported, not that the link is broken.

    Returns:
        The integer status code (including 4xx/5xx codes), or ``-1`` for
        connection-level failures and malformed URLs.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    for method in ("HEAD", "GET"):
        try:
            req = urllib.request.Request(url, method=method, headers=headers)
            # Close the response right away; only the status is needed.
            with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
                return resp.getcode()
        except urllib.error.HTTPError as e:
            status = e.code
            if e.fp:
                e.close()
            if method == "HEAD" and status in (405, 501):
                continue  # HEAD not supported here; ask again with GET
            return status
        except Exception:
            return -1
    # Not reachable in practice: the GET pass always returns above.
    return -1
def extract_links(html):
    """Return the href values of all ``<a>`` tags in *html*, in document order.

    Parsing is best-effort: if the parser fails part-way through, the links
    collected up to that point are still returned.
    """
    parser = LinkExtractor()
    try:
        parser.feed(html)
    except Exception:
        # Deliberately tolerant of broken markup, but unlike a bare
        # ``except:`` this lets KeyboardInterrupt and SystemExit propagate.
        pass
    return parser.hrefs
def resolve_link(base_url, href):
    """Turn an href found on *base_url* into an absolute URL.

    * ``http://`` / ``https://`` hrefs are returned unchanged.
    * Protocol-relative hrefs (``//host/path``) are given the ``https:``
      scheme.
    * Everything else (root-relative, document-relative, query-only,
      fragment-only) is resolved against *base_url*.
    * hrefs carrying another scheme (``mailto:``, ``javascript:``, ``tel:``)
      come back unchanged.
    """
    from urllib.parse import urljoin

    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        return "https:" + href
    # urljoin covers "/abs/path" as well as "page.html", "../x", "?q=1" and
    # "#frag", which a simple scheme+host concatenation would leave
    # unresolved.
    return urljoin(base_url, href)
def check_page(url, label):
    """Audit one page: fetch it, sanity-check its content, probe its links.

    Steps, all reported through ``log``:
      1. Fetch *url*; stop early on any status other than 200.
      2. Estimate the word count with tags stripped and warn below 50 words.
      3. Report occurrences of the forbidden GitHub account URL.
      4. Extract ``<a href>`` links, classify them as internal or external
         by host name, and probe each unique non-asset link.

    Args:
        url: Absolute URL of the page to audit.
        label: Short tag (e.g. "Article", "Tool") used in the log header.

    Returns:
        A list of ``(link, status)`` tuples for links that failed, where
        *status* is the HTTP code (>= 400) or the string
        ``"CONNECTION_ERROR"``. Empty if the page itself is not a 200.
    """
    from urllib.parse import urlparse

    site_host = "blog.quant-view.xyz"
    asset_suffixes = (
        ".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
        ".woff", ".woff2", ".ttf", ".eot",
    )

    log(f"\n{'='*80}")
    log(f"[{label}] Checking: {url}")
    code, body = fetch(url)
    log(f" Status: {code}")
    log(f" Size: {len(body)} chars")
    if code != 200:
        log(" *** ERROR: Non-200 status code!")
        return []

    # Crude text extraction: drop anything that looks like a tag, then count
    # whitespace-separated tokens.
    content_stripped = re.sub(r'<[^>]+>', '', body).strip()
    word_count = len(content_stripped.split())
    log(f" Word count (approx): {word_count}")
    if word_count < 50:
        log(" *** WARNING: Very low content — possibly blank or broken page!")

    # Plain substring count: the pattern contains '.', which a regex search
    # would treat as "any character".
    bad_count = body.count(BAD_GITHUB_PATTERN)
    if bad_count:
        log(f" *** BAD LINKS FOUND: github.com/liudapao880807-arch appears {bad_count} time(s)!")
        for m in re.finditer(r'href="[^"]*liudapao880807-arch[^"]*"', body):
            log(f" -> {m.group()}")
    else:
        log(" No github.com/liudapao880807-arch links (good)")

    hrefs = extract_links(body)
    log(f" Total links found: {len(hrefs)}")

    internal = []
    external = []
    for h in hrefs:
        resolved = resolve_link(url, h)
        try:
            parts = urlparse(resolved)
            host = parts.hostname
        except ValueError:
            continue  # malformed URL; nothing sensible to probe
        # Only web links are probed; mailto:, javascript:, tel: and
        # unresolved relative paths have no http(s) scheme.
        if parts.scheme not in ("http", "https"):
            continue
        # Compare the host itself, so an external URL that merely mentions
        # the blog domain in its query string is not counted as internal.
        if host == site_host:
            internal.append(resolved)
        else:
            external.append(resolved)
    log(f" Internal links: {len(internal)}")
    log(f" External links: {len(external)}")

    broken = []
    ok_count = 0
    # dict.fromkeys de-duplicates while keeping document order, so repeated
    # runs probe links in the same sequence.
    for link in dict.fromkeys(internal + external):
        # Test the path only, so "style.css?v=3" is still seen as an asset.
        if urlparse(link).path.lower().endswith(asset_suffixes):
            continue
        time.sleep(0.3)  # small pause between probes
        status = check_url_status(link)
        if status == -1:
            broken.append((link, "CONNECTION_ERROR"))
        elif status >= 400:
            broken.append((link, status))
        else:
            ok_count += 1

    # Count what was actually probed, not the list that still held assets.
    log(f" Checked {ok_count + len(broken)} unique non-asset links")
    log(f" OK: {ok_count}")
    log(f" Broken/Errors: {len(broken)}")
    for link, status in broken:
        if status == "CONNECTION_ERROR":
            log(f" BROKEN (connection error): {link}")
        else:
            log(f" BROKEN ({status}): {link}")
    return broken
def check_sitemap(url, label, max_check=20):
    """Fetch a sitemap, list its URLs and probe the first *max_check* of them.

    The sitemap is parsed as XML using the sitemaps.org 0.9 namespace; if it
    is not well-formed, ``<loc>`` values are pulled out with a regular
    expression instead. Findings are reported through ``log``.

    Args:
        url: Absolute URL of the sitemap.
        label: Short tag used in the log header.
        max_check: How many leading sitemap entries to probe (default 20).

    Returns:
        None.
    """
    log(f"\n{'='*80}")
    log(f"[{label}] Checking sitemap: {url}")
    code, body = fetch(url)
    log(f" Status: {code}")
    if code != 200:
        log(" *** ERROR: Sitemap not accessible!")
        return

    try:
        root = ET.fromstring(body)
    except ET.ParseError:
        # Not well-formed XML: fall back to scraping <loc> elements.
        raw_locs = re.findall(r'<loc>\s*(https?://[^<]+)\s*</loc>', body)
    else:
        ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        # ".//sm:loc" already matches every <loc>, including those nested in
        # <url> elements, so one pass is enough.
        raw_locs = [loc.text for loc in root.findall(".//sm:loc", ns)]

    # An empty <loc/> has text None; skip it rather than crash on .strip().
    # Stripping also trims the trailing whitespace the regex fallback keeps.
    urls_in_sitemap = [t.strip() for t in raw_locs if t and t.strip()]

    log(f" URLs in sitemap: {len(urls_in_sitemap)}")
    if not urls_in_sitemap:
        log(" *** WARNING: No URLs found in sitemap!")
        return

    for u in urls_in_sitemap[:5]:
        log(f" Sample: {u}")

    broken_count = 0
    for u in urls_in_sitemap[:max_check]:
        time.sleep(0.2)  # small pause between probes
        status = check_url_status(u)
        if status == -1 or status >= 400:
            log(f" BROKEN in sitemap ({status}): {u}")
            broken_count += 1

    if broken_count == 0:
        log(f" First {max_check} sitemap URLs all OK")
    else:
        log(f" {broken_count} broken URLs in first {max_check} sitemap entries")
# Pages to audit, stored as (label, url) pairs. The label only tags the log
# output; it is attached here so each table is a plain list of URLs.
article_urls = [
    ("Article", page)
    for page in (
        "https://blog.quant-view.xyz/position-size-calculator-guide.html",
        "https://blog.quant-view.xyz/gold-trading-2026-guide.html",
        "https://blog.quant-view.xyz/ssh-tunnel-deployment-china.html",
        "https://blog.quant-view.xyz/github-seo-trading-tools.html",
        "https://blog.quant-view.xyz/gold-pip-value-calculator-wrong.html",
        "https://blog.quant-view.xyz/bloomberg-alternative.html",
        "https://blog.quant-view.xyz/why-retail-traders-lose-money.html",
        "https://blog.quant-view.xyz/order-flow-trading.html",
    )
]

tool_urls = [
    ("Tool", page)
    for page in (
        "https://blog.quant-view.xyz/tools/position-size-formula.html",
        "https://blog.quant-view.xyz/tools/xauusd-trading-guide.html",
        "https://blog.quant-view.xyz/tools/btc-position-size-calculator.html",
        "https://blog.quant-view.xyz/tools/risk-management-guide.html",
        "https://blog.quant-view.xyz/tools/tradingview-vs-mt5.html",
        "https://blog.quant-view.xyz/tools/kelly-criterion-formula.html",
        "https://blog.quant-view.xyz/tools/forex-position-size-calculator.html",
        "https://blog.quant-view.xyz/tools/pip-calculator-eurgbp.html",
        "https://blog.quant-view.xyz/tools/sp500-position-size-calculator.html",
        "https://blog.quant-view.xyz/tools/position-size-calculator-100000-dollar-account.html",
    )
]

# Sitemaps whose entries are spot-checked after the individual pages.
sitemap_urls = [
    ("Sitemap", page)
    for page in (
        "https://blog.quant-view.xyz/sitemap.xml",
        "https://blog.quant-view.xyz/sitemap-tools.xml",
        "https://blog.quant-view.xyz/sitemap-posts.xml",
    )
]
# ---- Script driver: run every check, print a summary, save the report ----
log("=" * 80)
log("BLOG AUDIT: https://blog.quant-view.xyz")
# Stamp the report with the day the audit actually runs (local time), not a
# date frozen into the source.
log(f"Date: {time.strftime('%Y-%m-%d')}")
log("=" * 80)

all_broken = []
# NOTE(review): never populated or read anywhere in the visible code; kept
# so the module's global names stay unchanged.
all_bad_github = []

for label, url in article_urls:
    all_broken.extend(check_page(url, label))
for label, url in tool_urls:
    all_broken.extend(check_page(url, label))
for label, url in sitemap_urls:
    check_sitemap(url, label)

log("\n" + "=" * 80)
log("SUMMARY")
log("=" * 80)
log(f"\nTotal broken links found: {len(all_broken)}")
for link, status in all_broken:
    log(f" [{status}] {link}")

# Snapshot the log before writing; the confirmation line below is therefore
# printed to the console but is not part of the saved file.
output = "\n".join(results)
report_path = r"D:\GFIL_BLOG\audit_results.txt"
with open(report_path, "w", encoding="utf-8") as f:
    f.write(output)
log(f"\nResults written to {report_path}")