246 lines
8.1 KiB
Python
246 lines
8.1 KiB
Python
import urllib.request
|
|
import urllib.error
|
|
import re
|
|
import ssl
|
|
import time
|
|
import xml.etree.ElementTree as ET
|
|
from html.parser import HTMLParser
|
|
|
|
def _make_unverified_context():
    """Build the SSL context used for every request made by this script.

    Hostname checking and certificate verification are both switched off,
    so HTTPS responses are accepted without authenticating the server.
    """
    context = ssl.create_default_context()
    # check_hostname has to be cleared first: ssl refuses CERT_NONE while
    # hostname checking is still enabled.
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE
    return context


# Shared by fetch() and check_url_status().
ctx = _make_unverified_context()

# Substring that check_page() reports when it appears in a page body.
BAD_GITHUB_PATTERN = "github.com/liudapao880807-arch"

# Every message passed to log(), in order; joined and written to the
# report file at the end of the run.
results = []
|
|
|
|
def log(msg):
    """Echo *msg* to stdout and record it in the module-level ``results`` list.

    The recorded messages are what later gets written to the report file.
    """
    print(msg)
    results.append(msg)
|
|
|
|
class LinkExtractor(HTMLParser):
    """HTML parser that collects the ``href`` of every ``<a>`` start tag.

    After ``feed()``, the collected values are available in ``hrefs`` in
    document order. Missing or empty ``href`` attributes are skipped.
    """

    def __init__(self):
        super().__init__()
        # href values seen so far, in the order they were encountered.
        self.hrefs = []

    def handle_starttag(self, tag, attrs):
        """Record non-empty ``href`` attribute values of anchor tags."""
        if tag != "a":
            return
        self.hrefs.extend(
            value for name, value in attrs if name == "href" and value
        )
|
|
|
|
def fetch(url, timeout=30):
    """Download *url* with a GET request and return ``(status, text)``.

    The body is decoded as UTF-8 with undecodable bytes replaced.

    Returns:
        ``(http_status, body)`` on success.
        ``(error_code, error_body)`` when the server answers with an HTTP
        error (``error_body`` is ``""`` if the error carries no body).
        ``(-1, str(exc))`` for any other failure, including a malformed URL.
    """
    try:
        # Built inside the try: Request() raises ValueError for a URL
        # without a usable scheme, which should be reported, not crash.
        req = urllib.request.Request(
            url,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        )
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
            return resp.getcode(), resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        if not e.fp:
            return e.code, ""
        try:
            return e.code, e.read().decode("utf-8", errors="replace")
        finally:
            # The error object wraps the open response; release it.
            e.close()
    except Exception as e:
        return -1, str(e)
|
|
|
|
def check_url_status(url, timeout=20):
    """Return the HTTP status code for *url*, or ``-1`` if it cannot be reached.

    A HEAD request is tried first. If the server answers 405 or 501 (method
    not allowed / not implemented), the probe is repeated with GET so that
    pages which merely reject HEAD are not reported as broken.

    HTTP error responses are returned as their status code; any other
    failure (malformed URL, DNS, timeout, ...) yields ``-1``.
    """

    def _probe(method):
        # One request with the given method; never raises.
        try:
            # Request() itself raises ValueError on a malformed URL, so it
            # must live inside the try.
            req = urllib.request.Request(
                url, method=method, headers={"User-Agent": "Mozilla/5.0"}
            )
            # Only the status line is needed; the context manager closes
            # the connection without reading the body.
            with urllib.request.urlopen(req, timeout=timeout, context=ctx) as resp:
                return resp.getcode()
        except urllib.error.HTTPError as e:
            if e.fp:
                e.close()
            return e.code
        except Exception:
            return -1

    status = _probe("HEAD")
    if status in (405, 501):
        status = _probe("GET")
    return status
|
|
|
|
def extract_links(html):
    """Return the ``href`` values of all ``<a>`` tags found in *html*.

    Parsing is best-effort: if the parser raises part-way through, the
    links collected up to that point are still returned.
    """
    parser = LinkExtractor()
    try:
        parser.feed(html)
    except Exception:
        # Deliberately tolerant of malformed markup, but not a bare except:
        # KeyboardInterrupt and SystemExit must still propagate.
        pass
    return parser.hrefs
|
|
|
|
def resolve_link(base_url, href):
    """Turn an ``href`` found on the page at *base_url* into an absolute URL.

    * ``http://`` / ``https://`` links are returned unchanged.
    * Protocol-relative links (``//host/path``) are given the ``https`` scheme.
    * Everything else is resolved against *base_url*: root-relative
      (``/x``), document-relative (``x.html``, ``../x``), query-only and
      fragment-only references. Non-hierarchical schemes such as
      ``mailto:`` or ``javascript:`` come back unchanged.
    """
    if href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        return "https:" + href

    from urllib.parse import urljoin

    # urljoin implements standard relative-reference resolution, so
    # document-relative links are no longer returned unresolved.
    return urljoin(base_url, href)
|
|
|
|
def check_page(url, label):
    """Audit one page, log the findings, and return its broken links.

    Steps, all reported through ``log``:
      1. Fetch *url*; stop with ``[]`` if the status is not 200.
      2. Estimate the word count of the tag-stripped body and warn below 50.
      3. Report occurrences of ``BAD_GITHUB_PATTERN`` in the body.
      4. Extract the page's links, resolve them, and probe every unique
         link that is internal or starts with ``http`` (static assets are
         skipped) with ``check_url_status``.

    Args:
        url: Address of the page to audit.
        label: Short tag printed in front of the page's log section.

    Returns:
        List of ``(link, status)`` tuples, where ``status`` is the HTTP
        code (>= 400) or the string ``"CONNECTION_ERROR"``.
    """
    asset_suffixes = (
        ".css", ".js", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico",
        ".woff", ".woff2", ".ttf", ".eot",
    )

    log(f"\n{'='*80}")
    log(f"[{label}] Checking: {url}")
    code, body = fetch(url)
    log(f" Status: {code}")
    log(f" Size: {len(body)} chars")

    if code != 200:
        log(" *** ERROR: Non-200 status code!")
        return []

    # Rough visible-text size: drop anything that looks like a tag.
    content_stripped = re.sub(r'<[^>]+>', '', body).strip()
    word_count = len(content_stripped.split())
    log(f" Word count (approx): {word_count}")
    if word_count < 50:
        log(" *** WARNING: Very low content — possibly blank or broken page!")

    # Literal substring count: used as a regex, the '.' in the pattern
    # would match any character.
    bad_count = body.count(BAD_GITHUB_PATTERN)
    if bad_count:
        log(f" *** BAD LINKS FOUND: github.com/liudapao880807-arch appears {bad_count} time(s)!")
        for m in re.finditer(r'href="[^"]*liudapao880807-arch[^"]*"', body):
            log(f" -> {m.group()}")
    else:
        log(" No github.com/liudapao880807-arch links (good)")

    hrefs = extract_links(body)
    log(f" Total links found: {len(hrefs)}")

    internal = []
    external = []
    for h in hrefs:
        resolved = resolve_link(url, h)
        if "blog.quant-view.xyz" in resolved:
            internal.append(resolved)
        elif resolved.startswith("http"):
            external.append(resolved)

    log(f" Internal links: {len(internal)}")
    log(f" External links: {len(external)}")

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # report lists links in the same order on every run.
    candidates = list(dict.fromkeys(internal + external))
    broken = []
    ok_count = 0

    for link in candidates:
        if link.endswith(asset_suffixes):
            continue
        if "mailto:" in link or "javascript:" in link:
            continue
        time.sleep(0.3)  # brief pause between probes
        status = check_url_status(link)
        if status == -1:
            broken.append((link, "CONNECTION_ERROR"))
        elif status >= 400:
            broken.append((link, status))
        else:
            ok_count += 1

    # Count only links that were actually probed; skipped assets are excluded.
    log(f" Checked {ok_count + len(broken)} unique non-asset links")
    log(f" OK: {ok_count}")
    log(f" Broken/Errors: {len(broken)}")
    for link, status in broken:
        if status == "CONNECTION_ERROR":
            log(f" BROKEN (connection error): {link}")
        else:
            log(f" BROKEN ({status}): {link}")

    return broken
|
|
|
|
def check_sitemap(url, label):
    """Fetch a sitemap, log how many URLs it lists, and probe the first 20.

    ``<loc>`` entries are read with the standard sitemap namespace; if none
    are found that way, un-namespaced ``<loc>`` elements are tried. When the
    document is not well-formed XML, URLs are pulled out with a regex
    instead. Empty ``<loc>`` elements are ignored.

    Args:
        url: Address of the sitemap.
        label: Short tag printed in front of the sitemap's log section.

    Returns:
        None. All findings are reported through ``log``.
    """
    log(f"\n{'='*80}")
    log(f"[{label}] Checking sitemap: {url}")
    code, body = fetch(url)
    log(f" Status: {code}")
    if code != 200:
        log(" *** ERROR: Sitemap not accessible!")
        return

    try:
        root = ET.fromstring(body)
    except ET.ParseError:
        # Not well-formed XML: salvage whatever <loc> entries are visible.
        found = re.findall(r'<loc>\s*(https?://[^<]+)\s*</loc>', body)
        urls_in_sitemap = [u.strip() for u in found]
    else:
        ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        # './/sm:loc' already covers every './/sm:url/sm:loc', so one
        # lookup is enough.
        locs = root.findall(".//sm:loc", ns)
        if not locs:
            # Some sitemaps omit the namespace declaration.
            locs = root.findall(".//loc")
        # loc.text is None for an empty <loc/>; skip those entries.
        urls_in_sitemap = [
            loc.text.strip() for loc in locs if loc.text and loc.text.strip()
        ]

    log(f" URLs in sitemap: {len(urls_in_sitemap)}")

    if not urls_in_sitemap:
        log(" *** WARNING: No URLs found in sitemap!")
        # Nothing to probe; returning here avoids an "all OK" line for an
        # empty sitemap.
        return

    for u in urls_in_sitemap[:5]:
        log(f" Sample: {u}")

    broken_count = 0
    for u in urls_in_sitemap[:20]:
        time.sleep(0.2)  # brief pause between probes
        status = check_url_status(u)
        if status >= 400 or status == -1:
            log(f" BROKEN in sitemap ({status}): {u}")
            broken_count += 1

    if broken_count == 0:
        log(" First 20 sitemap URLs all OK")
    else:
        log(f" {broken_count} broken URLs in first 20 sitemap entries")
|
|
|
|
# Blog articles to audit, as (label, url) pairs.
article_urls = [
    ("Article", page)
    for page in (
        "https://blog.quant-view.xyz/position-size-calculator-guide.html",
        "https://blog.quant-view.xyz/gold-trading-2026-guide.html",
        "https://blog.quant-view.xyz/ssh-tunnel-deployment-china.html",
        "https://blog.quant-view.xyz/github-seo-trading-tools.html",
        "https://blog.quant-view.xyz/gold-pip-value-calculator-wrong.html",
        "https://blog.quant-view.xyz/bloomberg-alternative.html",
        "https://blog.quant-view.xyz/why-retail-traders-lose-money.html",
        "https://blog.quant-view.xyz/order-flow-trading.html",
    )
]

# Pages under /tools/ to audit, as (label, url) pairs.
tool_urls = [
    ("Tool", page)
    for page in (
        "https://blog.quant-view.xyz/tools/position-size-formula.html",
        "https://blog.quant-view.xyz/tools/xauusd-trading-guide.html",
        "https://blog.quant-view.xyz/tools/btc-position-size-calculator.html",
        "https://blog.quant-view.xyz/tools/risk-management-guide.html",
        "https://blog.quant-view.xyz/tools/tradingview-vs-mt5.html",
        "https://blog.quant-view.xyz/tools/kelly-criterion-formula.html",
        "https://blog.quant-view.xyz/tools/forex-position-size-calculator.html",
        "https://blog.quant-view.xyz/tools/pip-calculator-eurgbp.html",
        "https://blog.quant-view.xyz/tools/sp500-position-size-calculator.html",
        "https://blog.quant-view.xyz/tools/position-size-calculator-100000-dollar-account.html",
    )
]

# Sitemaps to audit, as (label, url) pairs.
sitemap_urls = [
    ("Sitemap", page)
    for page in (
        "https://blog.quant-view.xyz/sitemap.xml",
        "https://blog.quant-view.xyz/sitemap-tools.xml",
        "https://blog.quant-view.xyz/sitemap-posts.xml",
    )
]
|
|
|
|
# ---- Report header ----
for header_line in (
    "=" * 80,
    "BLOG AUDIT: https://blog.quant-view.xyz",
    f"Date: 2026-06-28",
    "=" * 80,
):
    log(header_line)

# Broken links accumulated across all audited pages, as (link, status).
all_broken = []
all_bad_github = []

# ---- Page audits: articles first, then tools ----
for label, url in article_urls + tool_urls:
    broken = check_page(url, label)
    all_broken += broken

# ---- Sitemap audits ----
for label, url in sitemap_urls:
    check_sitemap(url, label)

# ---- Summary ----
log("\n" + "=" * 80)
log("SUMMARY")
log("=" * 80)

log(f"\nTotal broken links found: {len(all_broken)}")
for link, status in all_broken:
    log(f" [{status}] {link}")

# Persist everything logged so far; the confirmation line below is logged
# after the write, so it reaches stdout but not the file.
output = "\n".join(results)
with open(r"D:\GFIL_BLOG\audit_results.txt", "w", encoding="utf-8") as f:
    f.write(output)

log(f"\nResults written to D:\\GFIL_BLOG\\audit_results.txt")
|