diff --git a/Cargo.lock b/Cargo.lock
index 63a7754..0cc4078 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -25,14 +25,17 @@ dependencies = [
  "chrono",
  "clippy",
  "feed-rs",
+ "html-escape",
  "log",
  "maud",
  "rand",
  "rayon",
+ "regex",
  "reqwest",
  "rss",
  "scraper",
  "simple_logger",
+ "url",
 ]
 
 [[package]]
@@ -640,6 +643,15 @@ version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
 
+[[package]]
+name = "html-escape"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
+dependencies = [
+ "utf8-width",
+]
+
 [[package]]
 name = "html5ever"
 version = "0.26.0"
@@ -1970,6 +1982,12 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
 
+[[package]]
+name = "utf8-width"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
+
 [[package]]
 name = "uuid"
 version = "1.8.0"
diff --git a/Cargo.toml b/Cargo.toml
index a1be58a..38c33c8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,17 +8,20 @@ homepage = "https://zine.ansonbiggs.com"
 license = "MIT"
 
 [dependencies]
-feed-rs = "2.0"
-reqwest = { version = "0.12", features = ["blocking"] }
-maud = "0.26"
-chrono = "0.4"
-scraper = "0.19"
-rayon = "1.8"
-simple_logger = "5.0"
-log = "0.4"
-rss = "2.0"
 anyhow = "1.0"
+chrono = "0.4"
+feed-rs = "2.0"
+html-escape = "0.2"
+log = "0.4"
+maud = "0.26"
 rand = "0.8"
+rayon = "1.8"
+regex = "1.5"
+reqwest = { version = "0.12", features = ["blocking"] }
+rss = "2.0"
+scraper = "0.19"
+simple_logger = "5.0"
+url = "2.4"
 
 [dev-dependencies]
 clippy = "0.0.302"
diff --git a/src/main.rs b/src/main.rs
index fbad2aa..352c69d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -5,7 +5,6 @@ extern crate reqwest;
 
 use rand::seq::SliceRandom;
 use rand::thread_rng;
-use std::collections::HashSet;
 use std::error::Error;
 use std::fs::write;
 use std::fs::DirBuilder;
@@ -16,92 +15,100 @@ mod web_fetchers;
 use rayon::prelude::*;
 
 fn main() -> Result<(), Box<dyn Error>> {
-    simple_logger::init_with_level(log::Level::Info).unwrap();
+    setup_logging()?;
 
-    let all_posts = utilities::read_feed("feeds.txt");
+    let posts = load_and_process_posts()?;
+    let archive_posts = create_archive_posts(&posts)?;
 
-    let mut posts = all_posts.clone();
+    generate_and_write_output(posts, archive_posts)?;
+
+    Ok(())
+}
+
+fn setup_logging() -> Result<(), Box<dyn Error>> {
+    simple_logger::init_with_level(log::Level::Info)?;
+    Ok(())
+}
+
+fn load_and_process_posts() -> Result<Vec<utilities::Post>, Box<dyn Error>> {
+    let mut posts = grab_posts()?;
+    process_posts(&mut posts)?;
+    Ok(posts)
+}
+
+fn grab_posts() -> Result<Vec<utilities::Post>, Box<dyn Error>> {
+    let mut posts = utilities::read_feed("feeds.txt");
     posts.retain(|post| post.score.is_positive());
-
-    // Keep only the first occurence of each main_url
-    {
-        let mut seen_urls = HashSet::new();
-        posts.retain(|post| seen_urls.insert(post.main_url.clone()));
-    }
+    utilities::retain_most_recent_based_on_main_url(&mut posts);
 
     let mut featured = utilities::read_feed("featured_feeds.txt");
-    // Give featured a small boost in points
-    featured = featured
-        .iter_mut()
-        .map(|post| {
-            post.score = (post.score as f64 * 1.5) as i64;
-            post.clone()
-        })
-        .collect::<Vec<_>>();
-
+    featured.iter_mut().for_each(|post| {
+        post.score = (post.score as f64 * 1.5) as i64;
+    });
     posts.extend(featured);
 
-    posts.par_iter_mut().for_each(utilities::find_image);
-    posts.par_iter_mut().for_each(utilities::validate);
+    Ok(posts)
+}
+
+fn process_posts(posts: &mut Vec<utilities::Post>) -> Result<(), Box<dyn Error>> {
+    posts.par_iter_mut().for_each(|post| {
+        utilities::find_image(post);
+        utilities::validate(post);
+    });
 
     posts.sort();
 
-    // Move the post with an image_url to the head of the list
     if let Some(pos) = posts.iter().position(|post| post.image_url.is_some()) {
         let post_with_image = posts.remove(pos);
         posts.insert(0, post_with_image);
     }
 
-    utilities::retain_first_main_url(&mut posts);
-
     posts.truncate(16);
 
+    Ok(())
+}
 
-    let mut old_posts = all_posts;
+fn create_archive_posts(posts: &[utilities::Post]) -> Result<Vec<utilities::Post>, Box<dyn Error>> {
+    const ARCHIVE_SIZE: usize = 100;
 
-    old_posts.retain(|p| !posts.contains(p));
+    let mut old_posts: Vec<_> = utilities::read_feed("feeds.txt")
+        .into_iter()
+        .filter(|p| !posts.contains(p))
+        .collect();
 
     old_posts.shuffle(&mut thread_rng());
 
-    let mut archive_posts: Vec<utilities::Post> = Vec::new();
-    let archive_size = 100;
+    let mut archive_posts = Vec::new();
+    while archive_posts.len() < ARCHIVE_SIZE && !old_posts.is_empty() {
+        let chunk_size = std::cmp::min(ARCHIVE_SIZE - archive_posts.len() + 50, old_posts.len());
+        let mut chunk: Vec<_> = old_posts.drain(..chunk_size).collect();
 
-    while (archive_posts.len() < archive_size) && (old_posts.len() > 50) {
-        let iter_size = archive_size - archive_posts.len();
-
-        let mut extracted = old_posts
-            .drain(0..=(iter_size + 50))
-            .collect::<Vec<_>>();
-
-        extracted.par_iter_mut().for_each(utilities::validate);
-        extracted.retain(|post| post.score != 0);
-
-        archive_posts.extend(extracted);
+        chunk.par_iter_mut().for_each(utilities::validate);
+        chunk.retain(|post| post.score != 0);
+        archive_posts.extend(chunk);
     }
-    archive_posts.truncate(archive_size);
+    archive_posts.truncate(ARCHIVE_SIZE);
+    Ok(archive_posts)
+}
 
-    let index = site_generator::generate_index(posts.clone(), archive_posts.clone());
-    let index_path = Path::new("output/index.html");
-    DirBuilder::new()
-        .recursive(true)
-        .create(index_path.parent().unwrap())
-        .unwrap();
+fn generate_and_write_output(
+    posts: Vec<utilities::Post>,
+    archive_posts: Vec<utilities::Post>,
+) -> Result<(), Box<dyn Error>> {
+    let index = site_generator::generate_index(posts.clone(), archive_posts);
+    write_to_file("output/index.html", index.into_string())?;
 
-    match write(index_path, index.into_string()) {
-        Ok(_) => log::info!("Successfully wrote to {}", index_path.display()),
-        Err(e) => log::error!("Failed to write to {}: {}", index_path.display(), e),
-    }
-
-    let feed = site_generator::generate_rss(posts.clone());
-    let feed_path = Path::new("output/feed.xml");
-    DirBuilder::new()
-        .recursive(true)
-        .create(feed_path.parent().unwrap())
-        .unwrap();
-
-    match write(feed_path, feed) {
-        Ok(_) => log::info!("Successfully wrote to {}", feed_path.display()),
-        Err(e) => log::error!("Failed to write to {}: {}", feed_path.display(), e),
-    }
+    let feed = site_generator::generate_rss(posts);
+    write_to_file("output/feed.xml", feed)?;
 
     Ok(())
 }
+
+fn write_to_file<P: AsRef<Path>>(path: P, content: String) -> Result<(), Box<dyn Error>> {
+    let path = path.as_ref();
+    DirBuilder::new()
+        .recursive(true)
+        .create(path.parent().unwrap())?;
+    write(path, content)?;
+    log::info!("Successfully wrote to {}", path.display());
+    Ok(())
+}
diff --git a/src/utilities.rs b/src/utilities.rs
index 68031ba..4582de3 100644
--- a/src/utilities.rs
+++ b/src/utilities.rs
@@ -6,8 +6,11 @@ use scraper::{Html, Selector};
 use std::collections::HashSet;
 
 use anyhow::Result;
+use html_escape::decode_html_entities;
+use regex::Regex;
 use std::cmp::Ordering;
 use std::fs;
+use url::Url;
 
 #[derive(Clone, PartialEq, Eq)]
 pub struct Post {
@@ -38,7 +41,8 @@ impl Post {
         let title = entry
             .title
             .as_ref()
-            .map_or_else(|| "".to_string(), |t| t.content.clone());
+            .map(|t| decode_html_entities(&t.content).into_owned())
+            .unwrap_or_default();
 
         let link = entry
             .links
@@ -74,10 +78,14 @@ impl Post {
             },
         );
 
-        let cleaned_description = strip_html_tags(&description);
+        let cleaned_description = strip_html_and_css_content(&description);
         let truncated_description = truncate_description(&cleaned_description, 500);
 
-        let main_url = get_root_url(link.href.as_str());
+        let main_url = Url::parse(&link.href)
+            .map_err(|_| anyhow::anyhow!("Failed to parse URL: {}", link.href))?
+            .host_str()
+            .map(String::from)
+            .ok_or_else(|| anyhow::anyhow!("No host in URL: {}", link.href))?;
 
         let mut score =
             (date - (chrono::Utc::now() - chrono::Duration::days(21))).num_minutes();
@@ -148,15 +156,6 @@ pub fn get_entry_date(entry: &Entry) -> DateTime<Utc> {
     entry.published.unwrap_or(entry.updated.unwrap_or_default())
 }
 
-pub fn get_root_url(input_url: &str) -> String {
-    let mut url = input_url;
-
-    url = url.strip_prefix("https://").unwrap_or(url);
-    url = url.strip_prefix("http://").unwrap_or(url);
-
-    url.split_once('/').unwrap().0.to_string()
-}
-
 pub fn truncate_description(description: &str, max_length: usize) -> String {
     let description_trimmed = description.trim();
     if description_trimmed.len() > max_length {
@@ -173,8 +172,17 @@ pub fn truncate_description(description: &str, max_length: usize) -> String {
     }
 }
 
-pub fn strip_html_tags(html: &str) -> String {
-    let document = Html::parse_document(html);
+pub fn strip_html_and_css_content(input: &str) -> String {
+    // First, remove CSS content
+    let css_regex = Regex::new(r"<style[^>]*>[\s\S]*?</style>").unwrap();
+    let without_css = css_regex.replace_all(input, "");
+
+    // Then, remove inline CSS
+    let inline_css_regex = Regex::new("\\s*style\\s*=\\s*\"[^\"]*\"").unwrap();
+    let without_inline_css = inline_css_regex.replace_all(&without_css, "");
+
+    // Parse the remaining HTML and extract text
+    let document = Html::parse_document(&without_inline_css);
     let selector = Selector::parse("*").unwrap();
 
     let mut text_content = String::new();
@@ -184,7 +192,11 @@ pub fn strip_html_tags(html: &str) -> String {
         text_content.push(' ');
     }
 
-    text_content.trim().to_string()
+    // Remove any remaining CSS-like content (for cases where it's not in a <style> tag)
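
Note on the `main_url` change: the removed `get_root_url` helper panicked on URLs without a path component (`split_once('/').unwrap()` returns `None` for `https://example.com`), while the new `Url::parse(...).host_str()` chain returns an error instead. A minimal standalone sketch of the new derivation, using a hypothetical link that is not from the repo:

```rust
use url::Url;

fn main() -> anyhow::Result<()> {
    // Hypothetical feed link; any scheme://host/path URL behaves the same.
    let link = "https://example.com/blog/post-1";

    // Same derivation as Post::main_url in the diff: keep only the host,
    // so posts are deduplicated per site rather than per full URL.
    let main_url = Url::parse(link)?
        .host_str()
        .map(String::from)
        .ok_or_else(|| anyhow::anyhow!("No host in URL: {}", link))?;
    assert_eq!(main_url, "example.com");

    // Unlike get_root_url, a path-less URL no longer panics:
    assert_eq!(Url::parse("https://example.com")?.host_str(), Some("example.com"));
    Ok(())
}
```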