mirror of https://gitlab.com/Anson-Projects/zine.git
synced 2025-06-17 05:56:39 +00:00

Refactor and clean up codebase

This commit is contained in:
parent fcd4248a3c
commit 2725245393
Cargo.lock (generated): 18 changes
@@ -25,14 +25,17 @@ dependencies = [
  "chrono",
  "clippy",
  "feed-rs",
+ "html-escape",
  "log",
  "maud",
  "rand",
  "rayon",
+ "regex",
  "reqwest",
  "rss",
  "scraper",
  "simple_logger",
+ "url",
 ]
 
 [[package]]
@@ -640,6 +643,15 @@ version = "0.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
 
+[[package]]
+name = "html-escape"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
+dependencies = [
+ "utf8-width",
+]
+
 [[package]]
 name = "html5ever"
 version = "0.26.0"
@@ -1970,6 +1982,12 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
 
+[[package]]
+name = "utf8-width"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
+
 [[package]]
 name = "uuid"
 version = "1.8.0"
Cargo.toml: 21 changes
@@ -8,17 +8,20 @@ homepage = "https://zine.ansonbiggs.com"
 license = "MIT"
 
 [dependencies]
-feed-rs = "2.0"
-reqwest = { version = "0.12", features = ["blocking"] }
-maud = "0.26"
-chrono = "0.4"
-scraper = "0.19"
-rayon = "1.8"
-simple_logger = "5.0"
-log = "0.4"
-rss = "2.0"
 anyhow = "1.0"
+chrono = "0.4"
+feed-rs = "2.0"
+html-escape = "0.2"
+log = "0.4"
+maud = "0.26"
 rand = "0.8"
+rayon = "1.8"
+regex = "1.5"
+reqwest = { version = "0.12", features = ["blocking"] }
+rss = "2.0"
+scraper = "0.19"
+simple_logger = "5.0"
+url = "2.4"
 
 [dev-dependencies]
 clippy = "0.0.302"
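Besides alphabetizing the dependency list, this adds the three crates that back the code changes further down: html-escape decodes HTML entities in feed titles, regex strips CSS out of descriptions, and url replaces the hand-rolled get_root_url helper. A minimal standalone sketch of what each one contributes (my example, not code from this commit):

    use html_escape::decode_html_entities;
    use regex::Regex;
    use url::Url;

    fn main() {
        // html-escape: "&amp;" in a feed title becomes "&".
        assert_eq!(decode_html_entities("Fish &amp; Chips"), "Fish & Chips");

        // regex: drop <style> blocks before extracting text.
        let css = Regex::new(r"<style[^>]*>[\s\S]*?</style>").unwrap();
        assert_eq!(css.replace_all("<style>p{}</style>hi", ""), "hi");

        // url: take the host from a link instead of slicing off the scheme by hand.
        let parsed = Url::parse("https://example.com/post/1").unwrap();
        assert_eq!(parsed.host_str(), Some("example.com"));
    }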
src/main.rs: 129 changes
@@ -5,7 +5,6 @@ extern crate reqwest;
 
 use rand::seq::SliceRandom;
 use rand::thread_rng;
-use std::collections::HashSet;
 use std::error::Error;
 use std::fs::write;
 use std::fs::DirBuilder;
@@ -16,92 +15,100 @@ mod web_fetchers;
 use rayon::prelude::*;
 
 fn main() -> Result<(), Box<dyn Error>> {
-    simple_logger::init_with_level(log::Level::Info).unwrap();
+    setup_logging()?;
 
-    let all_posts = utilities::read_feed("feeds.txt");
+    let posts = load_and_process_posts()?;
+    let archive_posts = create_archive_posts(&posts)?;
 
-    let mut posts = all_posts.clone();
+    generate_and_write_output(posts, archive_posts)?;
+
+    Ok(())
+}
+
+fn setup_logging() -> Result<(), Box<dyn Error>> {
+    simple_logger::init_with_level(log::Level::Info)?;
+    Ok(())
+}
+
+fn load_and_process_posts() -> Result<Vec<utilities::Post>, Box<dyn Error>> {
+    let mut posts = grab_posts()?;
+    process_posts(&mut posts)?;
+    Ok(posts)
+}
+
+fn grab_posts() -> Result<Vec<utilities::Post>, Box<dyn Error>> {
+    let mut posts = utilities::read_feed("feeds.txt");
     posts.retain(|post| post.score.is_positive());
 
-    // Keep only the first occurence of each main_url
-    {
-        let mut seen_urls = HashSet::new();
-        posts.retain(|post| seen_urls.insert(post.main_url.clone()));
-    }
+    utilities::retain_most_recent_based_on_main_url(&mut posts);
 
     let mut featured = utilities::read_feed("featured_feeds.txt");
-    // Give featured a small boost in points
-    featured = featured
-        .iter_mut()
-        .map(|post| {
-            post.score = (post.score as f64 * 1.5) as i64;
-            post.clone()
-        })
-        .collect::<Vec<_>>();
+    featured.iter_mut().for_each(|post| {
+        post.score = (post.score as f64 * 1.5) as i64;
+    });
 
     posts.extend(featured);
 
-    posts.par_iter_mut().for_each(utilities::find_image);
-    posts.par_iter_mut().for_each(utilities::validate);
+    Ok(posts)
+}
+
+fn process_posts(posts: &mut Vec<utilities::Post>) -> Result<(), Box<dyn Error>> {
+    posts.par_iter_mut().for_each(|post| {
+        utilities::find_image(post);
+        utilities::validate(post);
+    });
 
     posts.sort();
 
-    // Move the post with an image_url to the head of the list
     if let Some(pos) = posts.iter().position(|post| post.image_url.is_some()) {
         let post_with_image = posts.remove(pos);
         posts.insert(0, post_with_image);
     }
 
-    utilities::retain_first_main_url(&mut posts);
-
     posts.truncate(16);
+    Ok(())
+}
 
-    let mut old_posts = all_posts;
-    old_posts.retain(|p| !posts.contains(p));
+fn create_archive_posts(posts: &[utilities::Post]) -> Result<Vec<utilities::Post>, Box<dyn Error>> {
+    const ARCHIVE_SIZE: usize = 100;
+
+    let mut old_posts: Vec<_> = utilities::read_feed("feeds.txt")
+        .into_iter()
+        .filter(|p| !posts.contains(p))
+        .collect();
     old_posts.shuffle(&mut thread_rng());
 
-    let mut archive_posts: Vec<utilities::Post> = Vec::new();
-    let archive_size = 100;
-
-    while (archive_posts.len() < archive_size) && (old_posts.len() > 50) {
-        let iter_size = archive_size - archive_posts.len();
-
-        let mut extracted = old_posts
-            .drain(0..=(iter_size + 50))
-            .collect::<Vec<utilities::Post>>();
-
-        extracted.par_iter_mut().for_each(utilities::validate);
-        extracted.retain(|post| post.score != 0);
-
-        archive_posts.extend(extracted);
+    let mut archive_posts = Vec::new();
+    while archive_posts.len() < ARCHIVE_SIZE && !old_posts.is_empty() {
+        let chunk_size = std::cmp::min(ARCHIVE_SIZE - archive_posts.len() + 50, old_posts.len());
+        let mut chunk: Vec<_> = old_posts.drain(..chunk_size).collect();
+
+        chunk.par_iter_mut().for_each(utilities::validate);
+        chunk.retain(|post| post.score != 0);
+        archive_posts.extend(chunk);
     }
 
-    archive_posts.truncate(archive_size);
+    archive_posts.truncate(ARCHIVE_SIZE);
+    Ok(archive_posts)
+}
 
-    let index = site_generator::generate_index(posts.clone(), archive_posts.clone());
-    let index_path = Path::new("output/index.html");
-    DirBuilder::new()
-        .recursive(true)
-        .create(index_path.parent().unwrap())
-        .unwrap();
-
-    match write(index_path, index.into_string()) {
-        Ok(_) => log::info!("Successfully wrote to {}", index_path.display()),
-        Err(e) => log::error!("Failed to write to {}: {}", index_path.display(), e),
-    }
-
-    let feed = site_generator::generate_rss(posts.clone());
-    let feed_path = Path::new("output/feed.xml");
-    DirBuilder::new()
-        .recursive(true)
-        .create(feed_path.parent().unwrap())
-        .unwrap();
-
-    match write(feed_path, feed) {
-        Ok(_) => log::info!("Successfully wrote to {}", feed_path.display()),
-        Err(e) => log::error!("Failed to write to {}: {}", feed_path.display(), e),
-    }
+fn generate_and_write_output(
+    posts: Vec<utilities::Post>,
+    archive_posts: Vec<utilities::Post>,
+) -> Result<(), Box<dyn Error>> {
+    let index = site_generator::generate_index(posts.clone(), archive_posts);
+    write_to_file("output/index.html", index.into_string())?;
+
+    let feed = site_generator::generate_rss(posts);
+    write_to_file("output/feed.xml", feed)?;
 
     Ok(())
 }
+
+fn write_to_file<P: AsRef<Path>>(path: P, content: String) -> Result<(), Box<dyn Error>> {
+    let path = path.as_ref();
+    DirBuilder::new()
+        .recursive(true)
+        .create(path.parent().unwrap())?;
+    write(path, content)?;
+    log::info!("Successfully wrote to {}", path.display());
+    Ok(())
+}
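One behavioral fix hiding in this refactor: the old archive loop drained `0..=(iter_size + 50)` without checking how many posts were left, which panics once old_posts runs shorter than that range, and its `old_posts.len() > 50` guard could bail out with fewer than 100 archive posts. The new loop clamps every drain to what is actually available. A standalone sketch of the same chunking pattern (dummy integers stand in for utilities::Post, with 0 meaning "failed validation"):

    fn main() {
        const ARCHIVE_SIZE: usize = 10;
        let mut old_posts: Vec<i64> = (0..25).map(|n| n % 3).collect();

        let mut archive = Vec::new();
        while archive.len() < ARCHIVE_SIZE && !old_posts.is_empty() {
            // Over-fetch by 50 to absorb validation failures, but never past the end.
            let chunk_size = std::cmp::min(ARCHIVE_SIZE - archive.len() + 50, old_posts.len());
            let mut chunk: Vec<_> = old_posts.drain(..chunk_size).collect();
            chunk.retain(|score| *score != 0);
            archive.extend(chunk);
        }
        archive.truncate(ARCHIVE_SIZE);

        assert!(archive.len() <= ARCHIVE_SIZE);
        assert!(old_posts.is_empty());
    }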
src/utilities.rs

@@ -6,8 +6,11 @@ use scraper::{Html, Selector};
 use std::collections::HashSet;
 
 use anyhow::Result;
+use html_escape::decode_html_entities;
+use regex::Regex;
 use std::cmp::Ordering;
 use std::fs;
+use url::Url;
 
 #[derive(Clone, PartialEq, Eq)]
 pub struct Post {
@@ -38,7 +41,8 @@ impl Post {
         let title = entry
             .title
             .as_ref()
-            .map_or_else(|| "".to_string(), |t| t.content.clone());
+            .map(|t| decode_html_entities(&t.content).into_owned())
+            .unwrap_or_default();
 
         let link = entry
             .links
@@ -74,10 +78,14 @@ impl Post {
             },
         );
 
-        let cleaned_description = strip_html_tags(&description);
+        let cleaned_description = strip_html_and_css_content(&description);
         let truncated_description = truncate_description(&cleaned_description, 500);
 
-        let main_url = get_root_url(link.href.as_str());
+        let main_url = Url::parse(&link.href)
+            .map_err(|_| anyhow::anyhow!("Failed to parse URL: {}", link.href))?
+            .host_str()
+            .map(String::from)
+            .ok_or_else(|| anyhow::anyhow!("No host in URL: {}", link.href))?;
 
         let mut score = (date - (chrono::Utc::now() - chrono::Duration::days(21))).num_minutes();
 
@@ -148,15 +156,6 @@ pub fn get_entry_date(entry: &Entry) -> DateTime<Utc> {
     entry.published.unwrap_or(entry.updated.unwrap_or_default())
 }
 
-pub fn get_root_url(input_url: &str) -> String {
-    let mut url = input_url;
-
-    url = url.strip_prefix("https://").unwrap_or(url);
-    url = url.strip_prefix("http://").unwrap_or(url);
-
-    url.split_once('/').unwrap().0.to_string()
-}
-
 pub fn truncate_description(description: &str, max_length: usize) -> String {
     let description_trimmed = description.trim();
     if description_trimmed.len() > max_length {
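Dropping get_root_url also removes a latent panic: for a link with no path, such as "https://example.com", `url.split_once('/')` returns None and the `.unwrap()` blows up, whereas the Url-based replacement earlier in this commit surfaces bad links as errors instead. A quick standalone comparison (my example):

    use url::Url;

    fn main() {
        let bare = "https://example.com";

        // Old approach: strip the scheme, then split on '/'.
        let stripped = bare.strip_prefix("https://").unwrap_or(bare);
        assert_eq!(stripped.split_once('/'), None); // .unwrap() here would panic

        // New approach: parse and take the host, with errors instead of panics.
        let host = Url::parse(bare).unwrap().host_str().map(String::from);
        assert_eq!(host.as_deref(), Some("example.com"));
    }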
@@ -173,8 +172,17 @@ pub fn truncate_description(description: &str, max_length: usize) -> String {
     }
 }
 
-pub fn strip_html_tags(html: &str) -> String {
-    let document = Html::parse_document(html);
+pub fn strip_html_and_css_content(input: &str) -> String {
+    // First, remove CSS content
+    let css_regex = Regex::new(r"<style[^>]*>[\s\S]*?</style>").unwrap();
+    let without_css = css_regex.replace_all(input, "");
+
+    // Then, remove inline CSS
+    let inline_css_regex = Regex::new("\\s*style\\s*=\\s*\"[^\"]*\"").unwrap();
+    let without_inline_css = inline_css_regex.replace_all(&without_css, "");
+
+    // Parse the remaining HTML and extract text
+    let document = Html::parse_document(&without_inline_css);
     let selector = Selector::parse("*").unwrap();
     let mut text_content = String::new();
 
@@ -184,7 +192,11 @@ pub fn strip_html_tags(html: &str) -> String {
         text_content.push(' ');
     }
 
-    text_content.trim().to_string()
+    // Remove any remaining CSS-like content (for cases where it's not in a <style> tag)
+    let final_css_regex = Regex::new(r"\.[a-zA-Z0-9_-]+\s*\{[^}]*\}").unwrap();
+    let final_text = final_css_regex.replace_all(&text_content, "");
+
+    final_text.trim().to_string()
 }
 
 pub fn group_by_nth<T: Clone>(slice: &[T], n: usize) -> Vec<Vec<T>> {
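Altogether strip_html_and_css_content now makes three passes: regexes remove <style> blocks and inline style attributes, scraper extracts the text, and a final regex catches class-rule fragments that were never wrapped in a <style> tag. A standalone check of the three patterns (applied straight to a string here; in the function itself the last pass runs on text scraper has already extracted):

    use regex::Regex;

    fn main() {
        let input = r#"<style>.x{color:red}</style><p style="margin:0">Hello</p>.leak{font:0}"#;

        let without_css = Regex::new(r"<style[^>]*>[\s\S]*?</style>")
            .unwrap()
            .replace_all(input, "");
        let without_inline = Regex::new("\\s*style\\s*=\\s*\"[^\"]*\"")
            .unwrap()
            .replace_all(&without_css, "");
        let final_text = Regex::new(r"\.[a-zA-Z0-9_-]+\s*\{[^}]*\}")
            .unwrap()
            .replace_all(&without_inline, "");

        assert_eq!(without_inline, "<p>Hello</p>.leak{font:0}");
        assert_eq!(final_text, "<p>Hello</p>");
    }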
@@ -264,7 +276,7 @@ pub fn validate(post: &mut Post) {
     }
 }
 
-pub fn retain_first_main_url(posts: &mut Vec<Post>) {
+pub fn retain_most_recent_based_on_main_url(posts: &mut Vec<Post>) {
     let mut seen_urls = HashSet::new();
     posts.retain(|post| seen_urls.insert(post.main_url.clone()));
 }
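The rename is more descriptive of intent, but note the helper still just keeps the first Post seen per main_url; it yields the most recent one only when the caller has already sorted posts newest-first. The dedup itself leans on HashSet::insert returning false for repeats, e.g. (standalone, with tuples standing in for Post):

    use std::collections::HashSet;

    fn main() {
        let mut posts = vec![("a.com", 3), ("b.com", 2), ("a.com", 1)];
        let mut seen_urls = HashSet::new();
        // insert() returns true only the first time a key is seen,
        // so retain() drops every later post from the same host.
        posts.retain(|(main_url, _)| seen_urls.insert(*main_url));
        assert_eq!(posts, vec![("a.com", 3), ("b.com", 2)]);
    }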