mirror of
https://gitlab.com/Anson-Projects/zine.git
synced 2026-06-03 19:30:25 +00:00
2592d8ac46
- Add exhaustive site validation tests (all links, images, meta tags) - Add link_host config field to fix feeds with broken URLs - Fix vitalik feed (rewrite vitalik.ca → vitalik.eth.limo) - Fix gzip-compressed feeds (evanjones.ca) with reqwest compression - Resolve relative URLs in feed entries against feed base URL - Switch to env_logger to filter html5ever warnings - Add site_health CI job (parallel, non-blocking) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
270 lines
7.8 KiB
Rust
270 lines
7.8 KiB
Rust
use rayon::prelude::*;
|
|
use scraper::{Html, Selector};
|
|
use std::collections::HashSet;
|
|
use std::fs;
|
|
use std::sync::Mutex;
|
|
use std::time::Duration;
|
|
|
|
/// Loads the generated landing page, `output/index.html`, into memory.
///
/// # Panics
///
/// Panics when the file cannot be read; the message reminds the caller to
/// generate the site with `cargo run` first.
fn read_generated_site() -> String {
    let index_path = "output/index.html";
    let page = fs::read_to_string(index_path);
    page.expect("Failed to read output/index.html - run `cargo run` first to generate the site")
}
|
|
|
|
fn extract_links(html: &str) -> Vec<String> {
|
|
let document = Html::parse_document(html);
|
|
let selector = Selector::parse("a[href]").unwrap();
|
|
|
|
document
|
|
.select(&selector)
|
|
.filter_map(|el| el.value().attr("href"))
|
|
.map(String::from)
|
|
.collect()
|
|
}
|
|
|
|
fn extract_images(html: &str) -> Vec<String> {
|
|
let document = Html::parse_document(html);
|
|
let selector = Selector::parse("img[src]").unwrap();
|
|
|
|
document
|
|
.select(&selector)
|
|
.filter_map(|el| el.value().attr("src"))
|
|
.map(String::from)
|
|
.collect()
|
|
}
|
|
|
|
/// Verifies that both generated artifacts (the HTML page and the feed) exist
/// on disk before any of the content checks make sense.
#[test]
fn test_site_was_generated() {
    // Each required output file is paired with the message shown when it is missing.
    let required = [
        (
            "output/index.html",
            "output/index.html does not exist - run `cargo run` first",
        ),
        (
            "output/feed.xml",
            "output/feed.xml does not exist - run `cargo run` first",
        ),
    ];

    for &(path, complaint) in &required {
        assert!(fs::metadata(path).is_ok(), "{}", complaint);
    }
}
|
|
|
|
/// Verifies that every link on the generated page either is one of the
/// exempt forms (fragment, `mailto:`, `/…`, `./…`) or parses as an absolute URL.
#[test]
fn test_all_links_are_valid_urls() {
    let page = read_generated_site();
    let hrefs = extract_links(&page);

    assert!(!hrefs.is_empty(), "No links found in generated site");

    // Anchor links, mailto links, and relative paths are accepted without parsing.
    let is_exempt = |href: &str| {
        href.starts_with('#')
            || href.starts_with("mailto:")
            || href.starts_with('/')
            || href.starts_with("./")
    };

    let rejected: Vec<String> = hrefs
        .iter()
        .filter(|href| !is_exempt(href.as_str()))
        .filter(|href| url::Url::parse(href.as_str()).is_err())
        .cloned()
        .collect();

    if rejected.is_empty() {
        return;
    }

    // Report every offender before failing so one run shows the whole list.
    for href in &rejected {
        eprintln!("Invalid URL: {}", href);
    }
    panic!("Found {} invalid URLs in generated site", rejected.len());
}
|
|
|
|
/// Verifies that external (http/https) links on the generated page are not
/// excessively duplicated.
///
/// Some repetition is expected (e.g. the same link in a title and in a
/// "read more"), so the check only fails when the ratio of total links to
/// unique links reaches 3.0 or more.
///
/// # Panics
///
/// Panics if the page contains no external links at all, or if the
/// duplication ratio is too high.
#[test]
fn test_no_duplicate_links_in_main_content() {
    let html = read_generated_site();
    let links = extract_links(&html);

    // Filter to only article links (external http/https links)
    let article_links: Vec<&str> = links
        .iter()
        .map(String::as_str)
        .filter(|l| l.starts_with("http://") || l.starts_with("https://"))
        .collect();

    // Without this guard the ratio below would be 0.0 / 0.0 = NaN, and the
    // comparison `NaN < 3.0` is false, producing a misleading
    // "Too much link duplication" failure for a page that has no links.
    assert!(
        !article_links.is_empty(),
        "No external article links found in generated site"
    );

    let unique: HashSet<&str> = article_links.iter().copied().collect();

    // `unique` is non-empty here because `article_links` is non-empty.
    let duplication_ratio = article_links.len() as f64 / unique.len() as f64;

    assert!(
        duplication_ratio < 3.0,
        "Too much link duplication: {} total links, {} unique (ratio: {:.2})",
        article_links.len(),
        unique.len(),
        duplication_ratio
    );
}
|
|
|
|
/// Verifies that every image source on the generated page either is one of
/// the exempt forms (`data:` URL, `/…`, `./…`) or parses as an absolute URL.
#[test]
fn test_no_broken_image_urls() {
    let page = read_generated_site();
    let sources = extract_images(&page);

    let mut rejected: Vec<String> = Vec::new();
    for src in sources.iter() {
        // Data URLs and relative paths are accepted without parsing.
        let exempt = src.starts_with("data:") || src.starts_with("/") || src.starts_with("./");
        if !exempt && url::Url::parse(src).is_err() {
            rejected.push(src.clone());
        }
    }

    if rejected.is_empty() {
        return;
    }

    // Report every offender before failing so one run shows the whole list.
    for src in &rejected {
        eprintln!("Invalid image URL: {}", src);
    }
    panic!(
        "Found {} invalid image URLs in generated site",
        rejected.len()
    );
}
|
|
|
|
/// Performs substring-level sanity checks on `output/feed.xml`: an XML
/// declaration, an RSS or Atom root element, and at least one item/entry tag.
///
/// This is a textual check only; the feed is not parsed as XML.
#[test]
fn test_feed_xml_is_valid() {
    let xml = fs::read_to_string("output/feed.xml").expect("Failed to read output/feed.xml");

    let has_declaration = xml.contains("<?xml");
    assert!(has_declaration, "Feed missing XML declaration");

    let has_root = ["<rss", "<feed"].iter().any(|tag| xml.contains(tag));
    assert!(has_root, "Feed missing RSS/Atom root element");

    let has_entries = ["<item>", "<entry>"].iter().any(|tag| xml.contains(tag));
    assert!(has_entries, "Feed has no items/entries");
}
|
|
|
|
/// Verifies that the generated page contains a `<title>` element and a
/// viewport `<meta>` tag.
#[test]
fn test_html_has_required_meta_tags() {
    let page = read_generated_site();
    let dom = Html::parse_document(&page);

    // True when at least one element in the document matches the CSS selector.
    let has_match = |css: &str| {
        let selector = Selector::parse(css).unwrap();
        dom.select(&selector).next().is_some()
    };

    assert!(has_match("title"), "Missing <title> tag");

    // The viewport meta tag is what makes the page responsive on mobile.
    assert!(
        has_match("meta[name='viewport']"),
        "Missing viewport meta tag"
    );
}
|
|
|
|
/// Verifies that every unique external (http/https) link on the generated
/// page responds with an acceptable HTTP status.
///
/// Each link is probed with a `HEAD` request. A 2xx, 3xx, or 405 response is
/// accepted as-is. Any other status triggers a `GET` retry, which must
/// return 2xx or 3xx. A transport error on either request marks the link as
/// broken. Links are checked in parallel via rayon.
///
/// # Panics
///
/// Panics if the HTTP client cannot be built or if any link is broken; the
/// broken links and their errors are printed to stderr first.
#[test]
fn test_all_links_are_reachable() {
    let html = read_generated_site();
    let links = extract_links(&html);

    // Get unique external links
    let external_links: Vec<String> = links
        .into_iter()
        .filter(|l| l.starts_with("http://") || l.starts_with("https://"))
        .collect::<HashSet<_>>()
        .into_iter()
        .collect();

    let total_links = external_links.len();
    println!("Checking {} unique external links...", total_links);

    // Build the client once and share it across worker threads. Constructing
    // a client per link repeats its setup for every request and prevents any
    // connection reuse between requests.
    let client = reqwest::blocking::Client::builder()
        .timeout(Duration::from_secs(15))
        .user_agent("Zine-Link-Checker/1.0")
        .build()
        .expect("Failed to create HTTP client");

    let broken_links: Mutex<Vec<(String, String)>> = Mutex::new(Vec::new());

    let is_ok = |status: reqwest::StatusCode| status.is_success() || status.is_redirection();

    // Check all links in parallel
    external_links.par_iter().for_each(|link| {
        let failure = match client.head(link).send() {
            // Accept 2xx, 3xx, and 405 (method not allowed - some servers don't allow HEAD)
            Ok(head) if is_ok(head.status()) || head.status().as_u16() == 405 => None,
            // Try GET as fallback (some servers reject HEAD)
            Ok(_) => match client.get(link).send() {
                Ok(get) if is_ok(get.status()) => None,
                Ok(get) => Some(get.status().to_string()),
                Err(e) => Some(e.to_string()),
            },
            Err(e) => Some(e.to_string()),
        };

        if let Some(error) = failure {
            broken_links.lock().unwrap().push((link.clone(), error));
        }
    });

    let broken = broken_links.into_inner().unwrap();

    if !broken.is_empty() {
        eprintln!("\nBroken links found:");
        for (link, error) in &broken {
            eprintln!("  {} - {}", link, error);
        }
        panic!(
            "Found {} broken links out of {} total",
            broken.len(),
            total_links
        );
    }

    println!("All {} links are reachable!", total_links);
}
|
|
|
|
/// Verifies that no article heading (`article h2`, `article h3`, or
/// `.post-title`) on the generated page is blank once surrounding
/// whitespace is trimmed.
///
/// Only titles are inspected here; descriptions are not checked.
#[test]
fn test_no_empty_titles_or_descriptions() {
    let page = read_generated_site();
    let dom = Html::parse_document(&page);

    let headings = Selector::parse("article h2, article h3, .post-title").unwrap();

    // Count headings whose concatenated text is empty after trimming.
    let blank_count = dom
        .select(&headings)
        .filter(|node| node.text().collect::<String>().trim().is_empty())
        .count();

    assert!(
        blank_count == 0,
        "Found {} empty titles in the generated site",
        blank_count
    );
}
|