1
0
mirror of https://gitlab.com/Anson-Projects/zine.git synced 2025-06-15 21:26:38 +00:00

Update Main Feeds Algorithm to Only Keep First Occurrence of Each main_url

This commit is contained in:
Anson Biggs 2024-05-31 23:34:24 -06:00
parent 015a1a7f19
commit 2b72ffadee

View File

@ -5,6 +5,7 @@ extern crate reqwest;
use rand::seq::SliceRandom; use rand::seq::SliceRandom;
use rand::thread_rng; use rand::thread_rng;
use std::collections::HashSet;
use std::error::Error; use std::error::Error;
use std::fs::write; use std::fs::write;
use std::fs::DirBuilder; use std::fs::DirBuilder;
@ -13,7 +14,6 @@ mod site_generator;
mod utilities; mod utilities;
mod web_fetchers; mod web_fetchers;
use rayon::prelude::*; use rayon::prelude::*;
use std::collections::HashMap;
fn main() -> Result<(), Box<dyn Error>> { fn main() -> Result<(), Box<dyn Error>> {
simple_logger::init_with_level(log::Level::Info).unwrap(); simple_logger::init_with_level(log::Level::Info).unwrap();
@ -23,15 +23,11 @@ fn main() -> Result<(), Box<dyn Error>> {
let mut posts = all_posts.clone(); let mut posts = all_posts.clone();
posts.retain(|post| post.score.is_positive()); posts.retain(|post| post.score.is_positive());
// Count occurences of main urls to punish blogs that post really frequently // Keep only the first occurence of each main_url
// which also filters out blogs that make tiny updates and change the published date {
let url_counts = posts.iter().fold(HashMap::new(), |mut acc, post| { let mut seen_urls = HashSet::new();
*acc.entry(post.main_url.clone()).or_insert(0) += 1; posts.retain(|post| seen_urls.insert(post.main_url.clone()));
acc }
});
posts.iter_mut().for_each(|post| {
post.score = (post.score / url_counts.get(&post.main_url).unwrap()) as i64;
});
let mut featured = utilities::read_feed("featured_feeds.txt"); let mut featured = utilities::read_feed("featured_feeds.txt");
// Give featured a small boost in points // Give featured a small boost in points