From 4c82106817e9f14a3af301bc3d06915c5c14b743 Mon Sep 17 00:00:00 2001 From: Anson Biggs Date: Wed, 15 May 2024 04:44:27 +0000 Subject: [PATCH] Add Archive Posts --- Cargo.lock | 5 +- Cargo.toml | 3 +- README.md | 2 +- featured_feeds.txt | 3 +- feeds.txt | 1 + src/main.rs | 76 ++++++++++++++++---------- src/site_generator.rs | 123 +++++++++++++++++++++++++++--------------- src/utilities.rs | 38 ++++++++++--- src/web_fetchers.rs | 6 +++ 9 files changed, 172 insertions(+), 85 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2e251e..7713214 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -27,6 +27,7 @@ dependencies = [ "feed-rs", "log", "maud", + "rand", "rayon", "reqwest", "rss", @@ -1610,9 +1611,9 @@ dependencies = [ [[package]] name = "simple_logger" -version = "4.3.3" +version = "5.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e7e46c8c90251d47d08b28b8a419ffb4aede0f87c2eea95e17d1d5bacbf3ef1" +checksum = "e8c5dfa5e08767553704aa0ffd9d9794d527103c736aba9854773851fd7497eb" dependencies = [ "colored", "log", diff --git a/Cargo.toml b/Cargo.toml index ad8ed77..db7b317 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,10 +14,11 @@ maud = "0.26" chrono = "0.4" scraper = "0.19" rayon = "1.8" -simple_logger = "4.3" +simple_logger = "5.0" log = "0.4" rss = "2.0" anyhow = "1.0" +rand = "0.8" [dev-dependencies] clippy = "0.0.302" diff --git a/README.md b/README.md index 631513c..ea83ba6 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Anson's Blogroll Zine -Anson's Blogroll Zine is a Rust application designed to aggregate content from multiple RSS feeds, creating a personalized news page. It fetches articles from `feeds.txt` and `featured.txt`, generates HTML cards for each entry, and outputs a single, styled HTML page. +Anson's Blogroll Zine is a Rust application designed to aggregate content from multiple RSS feeds, creating a personalized news page. 
It fetches articles from `feeds.txt` and `featured_feeds.txt`, generates HTML cards for each post, and outputs a single, styled HTML page. ## Algorithm diff --git a/featured_feeds.txt b/featured_feeds.txt index 9380a0b..1d6db40 100644 --- a/featured_feeds.txt +++ b/featured_feeds.txt @@ -1,2 +1 @@ -https://ciechanow.ski/atom.xml -https://factorio.com/blog/rss \ No newline at end of file +https://ciechanow.ski/atom.xml \ No newline at end of file diff --git a/feeds.txt b/feeds.txt index f0b8be5..6db7288 100644 --- a/feeds.txt +++ b/feeds.txt @@ -76,6 +76,7 @@ https://www.brendangregg.com/blog/rss.xml https://www.doscher.com/rss/ https://www.elidedbranches.com/feeds/posts/default https://www.evanjones.ca/index.rss +https://www.factorio.com/blog/rss https://www.jeffgeerling.com/blog.xml https://www.joelonsoftware.com/feed/ https://www.makerstations.io/rss/ diff --git a/src/main.rs b/src/main.rs index 6dbfef4..768cc78 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,6 +3,8 @@ extern crate feed_rs; extern crate maud; extern crate reqwest; +use rand::seq::SliceRandom; +use rand::thread_rng; use std::error::Error; use std::fs::write; use std::fs::DirBuilder; @@ -16,54 +18,70 @@ use std::collections::HashMap; fn main() -> Result<(), Box> { simple_logger::init_with_level(log::Level::Info).unwrap(); - let mut entries = utilities::read_feed("feeds.txt"); + let all_posts = utilities::read_feed("feeds.txt"); - entries.retain(|entry| entry.score.is_positive()); + let mut posts = all_posts.clone(); + posts.retain(|post| post.score.is_positive()); - // Count occurences of main urls - let url_counts = entries.iter().fold(HashMap::new(), |mut acc, post| { + // Count occurrences of main urls to punish blogs that post really frequently + // which also filters out blogs that make tiny updates and change the published date + let url_counts = posts.iter().fold(HashMap::new(), |mut acc, post| { *acc.entry(post.main_url.clone()).or_insert(0) += 1; acc }); - - // Punish blogs that post really 
often - entries.iter_mut().for_each(|entry| { - entry.score = (entry.score / url_counts.get(&entry.main_url).unwrap()) as i64; + posts.iter_mut().for_each(|post| { + post.score = (post.score / url_counts.get(&post.main_url).unwrap()) as i64; }); let mut featured = utilities::read_feed("featured_feeds.txt"); - + // Give featured a small boost in points featured = featured .iter_mut() .map(|post| { - post.score *= 1.5 as i64; + post.score = (post.score as f64 * 1.5) as i64; post.clone() }) .collect::>(); - entries.extend(featured); + posts.extend(featured); - entries.par_iter_mut().for_each(utilities::find_image); - entries.retain(|entry| entry.score.is_positive()); + posts.par_iter_mut().for_each(utilities::find_image); + posts.par_iter_mut().for_each(utilities::validate); - entries.sort(); + posts.sort(); - // Remove bottom 10% from list - entries.truncate(entries.len() - (entries.len() as f64 * 0.1).ceil() as usize); - - // Make sure first entry has an image since it is used as the featured post - let mut max_iter = 0; - while entries.first().unwrap().image_url.is_none() { - entries[0].score += -100; - entries.sort(); - - max_iter += 1; - if max_iter > 10000 { - break; - } + // Move the post with an image_url to the head of the list + if let Some(pos) = posts.iter().position(|post| post.image_url.is_some()) { + let post_with_image = posts.remove(pos); + posts.insert(0, post_with_image); } - let index = site_generator::generate_index(entries.clone()); + posts.truncate(16); + + let mut old_posts = all_posts; + + old_posts.retain(|p| !posts.contains(p)); + old_posts.shuffle(&mut thread_rng()); + + let mut archive_posts: Vec = Vec::new(); + let archive_size = 100; + + while (archive_posts.len() < archive_size) && (old_posts.len() > 50) { + let iter_size = archive_size - archive_posts.len(); + + let mut extracted = old_posts + .drain(0..=(iter_size + 50)) + .collect::>(); + + extracted.par_iter_mut().for_each(utilities::validate); + extracted.retain(|post| post.score 
!= 0); + + archive_posts.extend(extracted); + } + + archive_posts.truncate(archive_size); + + let index = site_generator::generate_index(posts.clone(), archive_posts.clone()); let index_path = Path::new("output/index.html"); DirBuilder::new() .recursive(true) @@ -75,7 +93,7 @@ fn main() -> Result<(), Box> { Err(e) => log::error!("Failed to write to {}: {}", index_path.display(), e), } - let feed = site_generator::generate_rss(entries.clone()); + let feed = site_generator::generate_rss(posts.clone()); let feed_path = Path::new("output/feed.xml"); DirBuilder::new() .recursive(true) diff --git a/src/site_generator.rs b/src/site_generator.rs index 02bffe7..702a2f9 100644 --- a/src/site_generator.rs +++ b/src/site_generator.rs @@ -11,23 +11,23 @@ use rss::{ChannelBuilder, Item, ItemBuilder}; use crate::utilities; -fn create_featured_card(entry: &utilities::Post) -> Markup { +fn create_featured_card(post: &utilities::Post) -> Markup { html! { article class="featured" { header { - @if entry.image_url.is_some() { - img src=(entry.image_url.as_ref().unwrap()) alt="Featured image"; + @if post.image_url.is_some() { + img src=(post.image_url.as_ref().unwrap()) alt="Featured image"; } hgroup { - h2 { (entry.title) } - a href=(format!("http://{}", entry.main_url)) { (entry.main_url) } + h2 { (post.title) } + a href=(format!("http://{}", post.main_url)) { (post.main_url) } } } body { - p { (entry.truncated_description) } + p { (post.truncated_description) } } footer { - a class="grid" href=(entry.link) style="--pico-text-decoration: none;" { + a class="grid" href=(post.link) style="--pico-text-decoration: none;" { button class="outline primary" { "Read Featured Post" } } } @@ -35,24 +35,24 @@ fn create_featured_card(entry: &utilities::Post) -> Markup { } } -fn create_post_card(entry: &utilities::Post) -> Markup { +fn create_post_card(post: &utilities::Post) -> Markup { html! 
{ article { header { hgroup { - h2 { (entry.title) } - a href=(format!("http://{}", entry.main_url)) { (entry.main_url) } + h2 { (post.title) } + a href=(format!("http://{}", post.main_url)) { (post.main_url) } } } body { - @if entry.image_url.is_some() { - img src=(entry.image_url.as_ref().unwrap()) alt="Entry image"; + @if post.image_url.is_some() { + img src=(post.image_url.as_ref().unwrap()); p; } - p { (entry.truncated_description) } + p { (post.truncated_description) } } footer { - a class="grid" href=(entry.link) style="--pico-text-decoration: none;" { + a class="grid" href=(post.link) style="--pico-text-decoration: none;" { button class="outline secondary" { "Read Post" } } } @@ -110,11 +110,11 @@ fn generate_header() -> Markup { } fn about_modal(entries: Vec) -> Markup { - // Get link for each entry, which is a blog post then, + // Get link for each post, which is a blog post then, // convert it to a url to the main page of the blog let mut links = entries .iter() - .map(|entry| entry.main_url.as_str()) + .map(|post| post.main_url.as_str()) .collect::>() .into_iter() .collect::>(); @@ -153,9 +153,64 @@ fn about_modal(entries: Vec) -> Markup { } } -pub fn generate_index(mut entries: Vec) -> Markup { - let featured = entries.first().unwrap().clone(); - entries.remove(0); +pub fn generate_head() -> Markup { + html! 
{ + head { + title { "Anson's Zine | Public RSS Feed" } + meta charset="utf-8"; + meta name="viewport" content="width=device-width, initial-scale=1"; + meta name="description" content="Blogroll zine of RSS feeds for Anson" + link rel="apple-touch-icon" sizes="180x180" href="/favicon/apple-touch-icon.png"; + link rel="icon" type="image/png" sizes="32x32" href="/favicon/favicon-32x32.png"; + link rel="icon" type="image/png" sizes="16x16" href="/favicon/favicon-16x16.png"; + link rel="manifest" href="/favicon/site.webmanifest"; + link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.blue.min.css"; + link rel="stylesheet" href="style.css"; + + // Open Graph meta tags + meta property="og:title" content="Anson's Zine | Public RSS Feed"; + meta property="og:description" content="Blogroll zine of RSS feeds for Anson"; + meta property="og:url" content="https://blogroll.ansonbiggs.com"; + meta property="og:type" content="website"; + } + } +} + +fn generate_archive_table(posts: Vec) -> Markup { + html! { + table class="striped" { + thead { + tr { + th { "Title" } + th { "Date" } + } + } + tbody { + @for post in posts { + tr { + td { + a href=(post.link) { (post.title) } + br; + small { + a href=(format!("http://{}", post.main_url)) { (post.main_url) } + } + } + td { + (post.date.format("%B %d, %Y").to_string()) + } + } + } + } + } + } +} + +pub fn generate_index( + mut posts: Vec, + archive_posts: Vec, +) -> Markup { + let featured = posts.first().unwrap().clone(); + posts.remove(0); log::info!( "Featured article: {}, img: {:?}", featured.link, @@ -165,39 +220,23 @@ pub fn generate_index(mut entries: Vec) -> Markup { html! 
{ (maud::DOCTYPE) html lang="en" { - head { - title { "Anson's Zine | Public RSS Feed" } - meta charset="utf-8"; - meta name="viewport" content="width=device-width, initial-scale=1"; - meta name="description" content="Blogroll zine of RSS feeds for Anson" - link rel="apple-touch-icon" sizes="180x180" href="/favicon/apple-touch-icon.png"; - link rel="icon" type="image/png" sizes="32x32" href="/favicon/favicon-32x32.png"; - link rel="icon" type="image/png" sizes="16x16" href="/favicon/favicon-16x16.png"; - link rel="manifest" href="/favicon/site.webmanifest"; - link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.blue.min.css"; - link rel="stylesheet" href="style.css"; - - // Open Graph meta tags - meta property="og:title" content="Anson's Zine | Public RSS Feed"; - meta property="og:description" content="Blogroll zine of RSS feeds for Anson"; - meta property="og:url" content="https://blogroll.ansonbiggs.com"; - meta property="og:type" content="website"; - } + {(generate_head())} body { main class="container" { {(generate_header())} {(create_featured_card(&featured))} div class="grid" { - @for column_entries in utilities::group_by_nth(&entries, 3) { + @for column_posts in utilities::group_by_nth(&posts, 3) { div { - @for entry in column_entries { - {(create_post_card(&entry))} + @for post in column_posts { + {(create_post_card(&post))} } } - } } + h2 {"Random Old Posts"} + {(generate_archive_table(archive_posts))} {(generate_footer())} - {(about_modal(entries))} + {(about_modal(posts))} script src="modal.js" {} script src="minimal-theme-switcher.js" {} }} diff --git a/src/utilities.rs b/src/utilities.rs index 10ba060..60e7810 100644 --- a/src/utilities.rs +++ b/src/utilities.rs @@ -1,4 +1,4 @@ -use crate::web_fetchers; +use crate::web_fetchers::{self, is_valid_url}; use chrono::{DateTime, Utc}; use feed_rs::model::Entry; use rayon::prelude::*; @@ -16,7 +16,7 @@ pub struct Post { pub image_url: Option, pub truncated_description: String, 
pub main_url: String, - pub score: i64, + pub score: i64, // Score values still very much in flux } impl Ord for Post { @@ -203,24 +203,46 @@ pub fn group_by_nth(slice: &[T], n: usize) -> Vec> { .collect() } -pub fn find_image(entry: &mut Post) { - if let Some(image_url) = &entry.image_url { +pub fn find_image(post: &mut Post) { + if let Some(image_url) = &post.image_url { match web_fetchers::is_valid_image_url(image_url) { Ok(true) => {} _ => { - entry.image_url = None; + post.image_url = None; } } } else { - match web_fetchers::fetch_social_image(entry.link.clone()) { + match web_fetchers::fetch_social_image(post.link.clone()) { Ok(social_image_url) => { if web_fetchers::is_valid_image_url(&social_image_url).unwrap_or(false) { - entry.image_url = Some(social_image_url); + post.image_url = Some(social_image_url); } } Err(error) => { - log::warn!("{}: {}", error, entry.link.clone()); + post.score = (post.score as f64 * 0.9) as i64; + log::warn!("{}: {}", error, post.link.clone()); } } } } + +pub fn validate(post: &mut Post) { + if post.title.is_empty() { + post.score = 0; + return; + } + + if !post.lang.is_empty() && post.lang != "en" { + post.score = 0; + return; + } + + if post.truncated_description.is_empty() { + post.score = (post.score as f64 * 0.5) as i64; + }; + + if !is_valid_url(post.link.as_str()) { + post.score = 0; + println!("{} is not valid", post.link.as_str()); + }; +} diff --git a/src/web_fetchers.rs b/src/web_fetchers.rs index a7cf0ff..b49d43c 100644 --- a/src/web_fetchers.rs +++ b/src/web_fetchers.rs @@ -46,3 +46,9 @@ pub fn is_valid_image_url(url: &str) -> Result> ct.to_str().map_or(false, |s| s.starts_with("image/")) })) } + +pub fn is_valid_url(url: &str) -> bool { + let client = reqwest::blocking::Client::new(); + + client.get(url).send().is_ok() +}