
Refactor and clean up codebase

Anson Biggs 2024-07-01 22:46:59 -06:00
parent fcd4248a3c
commit 2725245393
4 changed files with 127 additions and 87 deletions

Cargo.lock (generated)

@@ -25,14 +25,17 @@ dependencies = [
"chrono",
"clippy",
"feed-rs",
"html-escape",
"log",
"maud",
"rand",
"rayon",
"regex",
"reqwest",
"rss",
"scraper",
"simple_logger",
"url",
]
[[package]]
@@ -640,6 +643,15 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "html-escape"
version = "0.2.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
dependencies = [
"utf8-width",
]
[[package]]
name = "html5ever"
version = "0.26.0"
@@ -1970,6 +1982,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-width"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
[[package]]
name = "uuid"
version = "1.8.0"
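The two new lockfile packages are html-escape, the direct dependency declared in Cargo.toml below, and utf8-width, which the lock lists as html-escape's only dependency. A minimal standalone sketch (not project code; the example title is made up) of what html-escape is used for in this commit, decoding HTML entities in feed titles:

```rust
use html_escape::decode_html_entities;

fn main() {
    // decode_html_entities returns a Cow<str>, so it only allocates
    // when the input actually contains entities to decode.
    let raw_title = "Ferris &amp; Friends &lt;3";
    assert_eq!(decode_html_entities(raw_title), "Ferris & Friends <3");
}
```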

Cargo.toml

@@ -8,17 +8,20 @@ homepage = "https://zine.ansonbiggs.com"
license = "MIT"
[dependencies]
feed-rs = "2.0"
reqwest = { version = "0.12", features = ["blocking"] }
maud = "0.26"
chrono = "0.4"
scraper = "0.19"
rayon = "1.8"
simple_logger = "5.0"
log = "0.4"
rss = "2.0"
anyhow = "1.0"
chrono = "0.4"
feed-rs = "2.0"
html-escape = "0.2"
log = "0.4"
maud = "0.26"
rand = "0.8"
rayon = "1.8"
regex = "1.5"
reqwest = { version = "0.12", features = ["blocking"] }
rss = "2.0"
scraper = "0.19"
simple_logger = "5.0"
url = "2.4"
[dev-dependencies]
clippy = "0.0.302"
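The dependency list comes out alphabetized and now includes html-escape (new in the lockfile above). The regex crate backs the CSS-stripping rewrite in utilities.rs further down; a standalone sketch (not project code; the HTML snippet is made up) using the same <style>-removal pattern that the new strip_html_and_css_content helper compiles:

```rust
use regex::Regex;

fn main() {
    // Non-greedy match across newlines drops whole <style> blocks
    // before the remaining markup is handed to the HTML parser.
    let css_regex = Regex::new(r"<style[^>]*>[\s\S]*?</style>").unwrap();
    let html = "<style>p { color: red; }</style><p>Hello</p>";
    assert_eq!(css_regex.replace_all(html, ""), "<p>Hello</p>");
}
```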

src/main.rs

@@ -5,7 +5,6 @@ extern crate reqwest;
use rand::seq::SliceRandom;
use rand::thread_rng;
use std::collections::HashSet;
use std::error::Error;
use std::fs::write;
use std::fs::DirBuilder;
@@ -16,92 +15,100 @@ mod web_fetchers;
use rayon::prelude::*;
fn main() -> Result<(), Box<dyn Error>> {
simple_logger::init_with_level(log::Level::Info).unwrap();
setup_logging()?;
let all_posts = utilities::read_feed("feeds.txt");
let posts = load_and_process_posts()?;
let archive_posts = create_archive_posts(&posts)?;
let mut posts = all_posts.clone();
generate_and_write_output(posts, archive_posts)?;
Ok(())
}
fn setup_logging() -> Result<(), Box<dyn Error>> {
simple_logger::init_with_level(log::Level::Info)?;
Ok(())
}
fn load_and_process_posts() -> Result<Vec<utilities::Post>, Box<dyn Error>> {
let mut posts = grab_posts()?;
process_posts(&mut posts)?;
Ok(posts)
}
fn grab_posts() -> Result<Vec<utilities::Post>, Box<dyn Error>> {
let mut posts = utilities::read_feed("feeds.txt");
posts.retain(|post| post.score.is_positive());
// Keep only the first occurence of each main_url
{
let mut seen_urls = HashSet::new();
posts.retain(|post| seen_urls.insert(post.main_url.clone()));
}
utilities::retain_most_recent_based_on_main_url(&mut posts);
let mut featured = utilities::read_feed("featured_feeds.txt");
// Give featured a small boost in points
featured = featured
.iter_mut()
.map(|post| {
post.score = (post.score as f64 * 1.5) as i64;
post.clone()
})
.collect::<Vec<_>>();
featured.iter_mut().for_each(|post| {
post.score = (post.score as f64 * 1.5) as i64;
});
posts.extend(featured);
posts.par_iter_mut().for_each(utilities::find_image);
posts.par_iter_mut().for_each(utilities::validate);
Ok(posts)
}
fn process_posts(posts: &mut Vec<utilities::Post>) -> Result<(), Box<dyn Error>> {
posts.par_iter_mut().for_each(|post| {
utilities::find_image(post);
utilities::validate(post);
});
posts.sort();
// Move the post with an image_url to the head of the list
if let Some(pos) = posts.iter().position(|post| post.image_url.is_some()) {
let post_with_image = posts.remove(pos);
posts.insert(0, post_with_image);
}
utilities::retain_first_main_url(&mut posts);
posts.truncate(16);
Ok(())
}
let mut old_posts = all_posts;
fn create_archive_posts(posts: &[utilities::Post]) -> Result<Vec<utilities::Post>, Box<dyn Error>> {
const ARCHIVE_SIZE: usize = 100;
old_posts.retain(|p| !posts.contains(p));
let mut old_posts: Vec<_> = utilities::read_feed("feeds.txt")
.into_iter()
.filter(|p| !posts.contains(p))
.collect();
old_posts.shuffle(&mut thread_rng());
let mut archive_posts: Vec<utilities::Post> = Vec::new();
let archive_size = 100;
let mut archive_posts = Vec::new();
while archive_posts.len() < ARCHIVE_SIZE && !old_posts.is_empty() {
let chunk_size = std::cmp::min(ARCHIVE_SIZE - archive_posts.len() + 50, old_posts.len());
let mut chunk: Vec<_> = old_posts.drain(..chunk_size).collect();
while (archive_posts.len() < archive_size) && (old_posts.len() > 50) {
let iter_size = archive_size - archive_posts.len();
let mut extracted = old_posts
.drain(0..=(iter_size + 50))
.collect::<Vec<utilities::Post>>();
extracted.par_iter_mut().for_each(utilities::validate);
extracted.retain(|post| post.score != 0);
archive_posts.extend(extracted);
chunk.par_iter_mut().for_each(utilities::validate);
chunk.retain(|post| post.score != 0);
archive_posts.extend(chunk);
}
archive_posts.truncate(archive_size);
archive_posts.truncate(ARCHIVE_SIZE);
Ok(archive_posts)
}
let index = site_generator::generate_index(posts.clone(), archive_posts.clone());
let index_path = Path::new("output/index.html");
DirBuilder::new()
.recursive(true)
.create(index_path.parent().unwrap())
.unwrap();
fn generate_and_write_output(
posts: Vec<utilities::Post>,
archive_posts: Vec<utilities::Post>,
) -> Result<(), Box<dyn Error>> {
let index = site_generator::generate_index(posts.clone(), archive_posts);
write_to_file("output/index.html", index.into_string())?;
match write(index_path, index.into_string()) {
Ok(_) => log::info!("Successfully wrote to {}", index_path.display()),
Err(e) => log::error!("Failed to write to {}: {}", index_path.display(), e),
}
let feed = site_generator::generate_rss(posts.clone());
let feed_path = Path::new("output/feed.xml");
DirBuilder::new()
.recursive(true)
.create(feed_path.parent().unwrap())
.unwrap();
match write(feed_path, feed) {
Ok(_) => log::info!("Successfully wrote to {}", feed_path.display()),
Err(e) => log::error!("Failed to write to {}: {}", feed_path.display(), e),
}
let feed = site_generator::generate_rss(posts);
write_to_file("output/feed.xml", feed)?;
Ok(())
}
fn write_to_file<P: AsRef<Path>>(path: P, content: String) -> Result<(), Box<dyn Error>> {
let path = path.as_ref();
DirBuilder::new()
.recursive(true)
.create(path.parent().unwrap())?;
write(path, content)?;
log::info!("Successfully wrote to {}", path.display());
Ok(())
}
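One behavioral note on create_archive_posts: the old loop drained a fixed 0..=(iter_size + 50) range, which panics if fewer posts remain, and its old_posts.len() > 50 guard could leave a small tail unprocessed; the new version clamps the chunk to whatever is left. A standalone sketch of that clamped-drain pattern (take_chunk is an illustrative name, not project code):

```rust
// Clamp the requested chunk to the remaining length so drain is never
// handed an out-of-bounds range.
fn take_chunk<T>(source: &mut Vec<T>, wanted: usize) -> Vec<T> {
    let chunk_size = std::cmp::min(wanted + 50, source.len());
    source.drain(..chunk_size).collect()
}

fn main() {
    let mut old_posts: Vec<u32> = (0..10).collect();
    // Requesting more than is available is safe: we just get what's left.
    let chunk = take_chunk(&mut old_posts, 100);
    assert_eq!(chunk.len(), 10);
    assert!(old_posts.is_empty());
}
```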

src/utilities.rs

@@ -6,8 +6,11 @@ use scraper::{Html, Selector};
use std::collections::HashSet;
use anyhow::Result;
use html_escape::decode_html_entities;
use regex::Regex;
use std::cmp::Ordering;
use std::fs;
use url::Url;
#[derive(Clone, PartialEq, Eq)]
pub struct Post {
@@ -38,7 +41,8 @@ impl Post {
let title = entry
.title
.as_ref()
.map_or_else(|| "".to_string(), |t| t.content.clone());
.map(|t| decode_html_entities(&t.content).into_owned())
.unwrap_or_default();
let link = entry
.links
@@ -74,10 +78,14 @@
},
);
let cleaned_description = strip_html_tags(&description);
let cleaned_description = strip_html_and_css_content(&description);
let truncated_description = truncate_description(&cleaned_description, 500);
let main_url = get_root_url(link.href.as_str());
let main_url = Url::parse(&link.href)
.map_err(|_| anyhow::anyhow!("Failed to parse URL: {}", link.href))?
.host_str()
.map(String::from)
.ok_or_else(|| anyhow::anyhow!("No host in URL: {}", link.href))?;
let mut score = (date - (chrono::Utc::now() - chrono::Duration::days(21))).num_minutes();
@@ -148,15 +156,6 @@ pub fn get_entry_date(entry: &Entry) -> DateTime<Utc> {
entry.published.unwrap_or(entry.updated.unwrap_or_default())
}
pub fn get_root_url(input_url: &str) -> String {
let mut url = input_url;
url = url.strip_prefix("https://").unwrap_or(url);
url = url.strip_prefix("http://").unwrap_or(url);
url.split_once('/').unwrap().0.to_string()
}
pub fn truncate_description(description: &str, max_length: usize) -> String {
let description_trimmed = description.trim();
if description_trimmed.len() > max_length {
@@ -173,8 +172,17 @@
}
}
pub fn strip_html_tags(html: &str) -> String {
let document = Html::parse_document(html);
pub fn strip_html_and_css_content(input: &str) -> String {
// First, remove CSS content
let css_regex = Regex::new(r"<style[^>]*>[\s\S]*?</style>").unwrap();
let without_css = css_regex.replace_all(input, "");
// Then, remove inline CSS
let inline_css_regex = Regex::new("\\s*style\\s*=\\s*\"[^\"]*\"").unwrap();
let without_inline_css = inline_css_regex.replace_all(&without_css, "");
// Parse the remaining HTML and extract text
let document = Html::parse_document(&without_inline_css);
let selector = Selector::parse("*").unwrap();
let mut text_content = String::new();
@@ -184,7 +192,11 @@
text_content.push(' ');
}
text_content.trim().to_string()
// Remove any remaining CSS-like content (for cases where it's not in a <style> tag)
let final_css_regex = Regex::new(r"\.[a-zA-Z0-9_-]+\s*\{[^}]*\}").unwrap();
let final_text = final_css_regex.replace_all(&text_content, "");
final_text.trim().to_string()
}
pub fn group_by_nth<T: Clone>(slice: &[T], n: usize) -> Vec<Vec<T>> {
@@ -264,7 +276,7 @@ pub fn validate(post: &mut Post) {
}
}
pub fn retain_first_main_url(posts: &mut Vec<Post>) {
pub fn retain_most_recent_based_on_main_url(posts: &mut Vec<Post>) {
let mut seen_urls = HashSet::new();
posts.retain(|post| seen_urls.insert(post.main_url.clone()));
}
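The deleted get_root_url helper stripped the scheme by hand and panicked on links without a path component (split_once('/') returns None for "https://example.com"); the Post constructor now parses the link with url::Url and turns a missing host into an error. A standalone sketch of the new host extraction (host_of is an illustrative name, not project code):

```rust
use url::Url;

// Same chain as the new Post constructor code: parse the link, take its
// host, and return an error instead of panicking.
fn host_of(link: &str) -> anyhow::Result<String> {
    Url::parse(link)
        .map_err(|_| anyhow::anyhow!("Failed to parse URL: {}", link))?
        .host_str()
        .map(String::from)
        .ok_or_else(|| anyhow::anyhow!("No host in URL: {}", link))
}

fn main() -> anyhow::Result<()> {
    assert_eq!(host_of("https://example.com")?, "example.com");
    assert_eq!(host_of("https://blog.example.com/post/1")?, "blog.example.com");
    Ok(())
}
```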