// zine/tests/site_validation.rs

use rayon::prelude::*;
use scraper::{Html, Selector};
use std::collections::HashSet;
use std::fs;
use std::sync::Mutex;
use std::time::Duration;

/// Loads the generated landing page (`output/index.html`) into memory.
///
/// # Panics
///
/// Panics when the file cannot be read; the message reminds the caller to
/// generate the site with `cargo run` first.
fn read_generated_site() -> String {
    let index_path = "output/index.html";
    let hint = "Failed to read output/index.html - run `cargo run` first to generate the site";

    fs::read_to_string(index_path).expect(hint)
}

/// Returns the `href` attribute of every `<a>` element in `html` that has one,
/// in the order the selector yields them.
fn extract_links(html: &str) -> Vec<String> {
    // The selector is a fixed literal, so parsing it is not expected to fail.
    let anchors = Selector::parse("a[href]").unwrap();
    let page = Html::parse_document(html);

    let mut hrefs = Vec::new();
    for anchor in page.select(&anchors) {
        if let Some(href) = anchor.value().attr("href") {
            hrefs.push(href.to_owned());
        }
    }
    hrefs
}

/// Returns the `src` attribute of every `<img>` element in `html` that has one,
/// in the order the selector yields them.
fn extract_images(html: &str) -> Vec<String> {
    // The selector is a fixed literal, so parsing it is not expected to fail.
    let pictures = Selector::parse("img[src]").unwrap();
    let page = Html::parse_document(html);

    let mut sources = Vec::new();
    for picture in page.select(&pictures) {
        if let Some(src) = picture.value().attr("src") {
            sources.push(src.to_owned());
        }
    }
    sources
}

/// Checks that both generated artifacts (the HTML page and the feed) exist on disk.
#[test]
fn test_site_was_generated() {
    // Each entry pairs a required output file with the failure message shown when it is absent.
    let required = [
        (
            "output/index.html",
            "output/index.html does not exist - run `cargo run` first",
        ),
        (
            "output/feed.xml",
            "output/feed.xml does not exist - run `cargo run` first",
        ),
    ];

    for (path, message) in required {
        assert!(fs::metadata(path).is_ok(), "{}", message);
    }
}

/// Checks that every link on the generated page is either an accepted
/// non-absolute form or parses as a URL.
///
/// The page must contain at least one link. Every rejected link is printed to
/// stderr before the test panics with the total count.
#[test]
fn test_all_links_are_valid_urls() {
    let page = read_generated_site();
    let hrefs = extract_links(&page);

    assert!(!hrefs.is_empty(), "No links found in generated site");

    // Anchor links, mailto links, and `/` or `./` paths are accepted without parsing.
    const SKIPPED_PREFIXES: [&str; 4] = ["#", "mailto:", "/", "./"];

    let rejected: Vec<&String> = hrefs
        .iter()
        .filter(|href| {
            !SKIPPED_PREFIXES
                .iter()
                .any(|&prefix| href.starts_with(prefix))
        })
        .filter(|href| url::Url::parse(href.as_str()).is_err())
        .collect();

    if rejected.is_empty() {
        return;
    }

    for href in &rejected {
        eprintln!("Invalid URL: {}", href);
    }
    panic!("Found {} invalid URLs in generated site", rejected.len());
}

/// Guards against the same external link being repeated excessively on the page.
///
/// Only absolute `http://` / `https://` links are counted. The test fails when
/// the ratio of total external links to distinct external links reaches 3.0.
/// A page without any external links passes, because there is nothing to duplicate.
#[test]
fn test_no_duplicate_links_in_main_content() {
    let html = read_generated_site();
    let links = extract_links(&html);

    // Filter to only article links (external http/https links)
    let article_links: Vec<&str> = links
        .iter()
        .map(String::as_str)
        .filter(|l| l.starts_with("http://") || l.starts_with("https://"))
        .collect();

    // Without this guard the ratio below would be 0.0 / 0.0 = NaN; every
    // comparison against NaN is false, so the assertion would fail with a
    // misleading "too much duplication" message.
    if article_links.is_empty() {
        return;
    }

    let unique: HashSet<&str> = article_links.iter().copied().collect();

    // Some duplication is expected (e.g., link appears in title and "read more")
    // but we shouldn't have excessive duplication
    let duplication_ratio = article_links.len() as f64 / unique.len() as f64;

    assert!(
        duplication_ratio < 3.0,
        "Too much link duplication: {} total links, {} unique (ratio: {:.2})",
        article_links.len(),
        unique.len(),
        duplication_ratio
    );
}

/// Checks that every image source on the generated page is either an accepted
/// non-absolute form or parses as a URL.
///
/// Every rejected source is printed to stderr before the test panics with the
/// total count.
#[test]
fn test_no_broken_image_urls() {
    let page = read_generated_site();
    let sources = extract_images(&page);

    // Data URLs and `/` or `./` paths are accepted without parsing.
    let is_exempt =
        |src: &str| src.starts_with("data:") || src.starts_with('/') || src.starts_with("./");

    let rejected: Vec<&String> = sources
        .iter()
        .filter(|src| !is_exempt(src.as_str()))
        .filter(|src| url::Url::parse(src.as_str()).is_err())
        .collect();

    if rejected.is_empty() {
        return;
    }

    for src in &rejected {
        eprintln!("Invalid image URL: {}", src);
    }
    panic!(
        "Found {} invalid image URLs in generated site",
        rejected.len()
    );
}

/// Performs substring-level sanity checks on `output/feed.xml`.
///
/// This is not a real XML parse: it only looks for an XML declaration, an RSS
/// or Atom root element, and at least one item or entry tag.
#[test]
fn test_feed_xml_is_valid() {
    let feed = fs::read_to_string("output/feed.xml").expect("Failed to read output/feed.xml");

    // True when the feed text contains at least one of the given markers.
    let has_any = |markers: &[&str]| markers.iter().any(|&marker| feed.contains(marker));

    assert!(has_any(&["<?xml"]), "Feed missing XML declaration");
    assert!(
        has_any(&["<rss", "<feed"]),
        "Feed missing RSS/Atom root element"
    );
    assert!(
        has_any(&["<item>", "<entry>"]),
        "Feed has no items/entries"
    );
}

/// Checks that the generated page contains a `<title>` element and a viewport
/// `<meta>` tag.
#[test]
fn test_html_has_required_meta_tags() {
    let page = read_generated_site();
    let dom = Html::parse_document(&page);

    // Each entry pairs a CSS selector that must match at least once with the
    // message reported when it does not.
    let required = [
        ("title", "Missing <title> tag"),
        // The viewport meta tag is what makes the page mobile-responsive.
        ("meta[name='viewport']", "Missing viewport meta tag"),
    ];

    for (css, message) in required {
        let selector = Selector::parse(css).unwrap();
        assert!(dom.select(&selector).next().is_some(), "{}", message);
    }
}

#[test]
fn test_all_links_are_reachable() {
    let html = read_generated_site();
    let links = extract_links(&html);

    // Get unique external links
    let external_links: Vec<String> = links
        .into_iter()
        .filter(|l| l.starts_with("http://") || l.starts_with("https://"))
        .collect::<HashSet<_>>()
        .into_iter()
        .collect();

    let total_links = external_links.len();
    println!("Checking {} unique external links...", total_links);

    let broken_links: Mutex<Vec<(String, String)>> = Mutex::new(Vec::new());

    // Check all links in parallel
    external_links.par_iter().for_each(|link| {
        let client = reqwest::blocking::Client::builder()
            .timeout(Duration::from_secs(15))
            .user_agent("Zine-Link-Checker/1.0")
            .build()
            .expect("Failed to create HTTP client");

        let is_broken = match client.head(link).send() {
            Ok(response) => {
                let status = response.status();
                // Accept 2xx, 3xx, and 405 (method not allowed - some servers don't allow HEAD)
                if !status.is_success() && !status.is_redirection() && status.as_u16() != 405 {
                    // Try GET as fallback (some servers reject HEAD)
                    match client.get(link).send() {
                        Ok(get_response) => {
                            let get_status = get_response.status();
                            if !get_status.is_success() && !get_status.is_redirection() {
                                Some(get_status.to_string())
                            } else {
                                None
                            }
                        }
                        Err(e) => Some(e.to_string()),
                    }
                } else {
                    None
                }
            }
            Err(e) => Some(e.to_string()),
        };

        if let Some(error) = is_broken {
            broken_links.lock().unwrap().push((link.clone(), error));
        }
    });

    let broken = broken_links.into_inner().unwrap();

    if !broken.is_empty() {
        eprintln!("\nBroken links found:");
        for (link, error) in &broken {
            eprintln!("  {} - {}", link, error);
        }
        panic!(
            "Found {} broken links out of {} total",
            broken.len(),
            total_links
        );
    }

    println!("All {} links are reachable!", total_links);
}

/// Checks that no article heading on the generated page is blank.
///
/// Elements matching `article h2`, `article h3`, or `.post-title` are
/// inspected; a heading counts as empty when its text is empty after trimming
/// whitespace. Despite the test's name, only titles are checked here.
#[test]
fn test_no_empty_titles_or_descriptions() {
    let page = read_generated_site();
    let dom = Html::parse_document(&page);

    let headings = Selector::parse("article h2, article h3, .post-title").unwrap();

    let mut blank_count = 0usize;
    for heading in dom.select(&headings) {
        let text: String = heading.text().collect();
        if text.trim().is_empty() {
            blank_count += 1;
        }
    }

    assert!(
        blank_count == 0,
        "Found {} empty titles in the generated site",
        blank_count
    );
}