1
0
mirror of https://gitlab.com/Anson-Projects/projects.git synced 2025-09-14 09:35:04 +00:00

Replace iframe with direct HTML content extraction

- Extract article content from ghost-optimized pages
- Add extract_article_content() function with fallback to iframe
- Try multiple selectors to find main content area
- Provide graceful fallbacks for failed content extraction
- Remove unused variables and fix warnings
This commit is contained in:
2025-08-21 23:24:53 -06:00
parent e233a96f55
commit cdb96a50b7

View File

@@ -45,13 +45,29 @@ impl Post {
let slug = get_slug(link);
let summary = summarize_url(link).await;
// Extract content from ghost-optimized version
let ghost_content = extract_article_content(&link).await;
let html = html! {
div class="ghost-summary" {
h3 { "Summary" }
p { (summary) }
iframe src=(link) style="width: 100%; height: 80vh" { }
}
div class="ghost-content" {
(maud::PreEscaped(ghost_content))
}
div class="ghost-footer" {
hr {}
p {
"This content was originally posted on my projects website " a href=(link) { "here." }
" The above summary was made by the " a href=("https://help.kagi.com/kagi/api/summarizer.html")
{"Kagi Summarizer"}
em {
"This content was originally posted on my projects website "
a href=(link) { "here" }
". The above summary was generated by the "
a href=("https://help.kagi.com/kagi/api/summarizer.html") {"Kagi Summarizer"}
"."
}
}
}
}.into_string();
@@ -130,6 +146,51 @@ fn get_slug(link: &str) -> String {
link.split_once("/posts/").unwrap().1.to_string()
}
/// Fetches the ghost-optimized rendering of a post and returns its main
/// content as an HTML fragment.
///
/// The ghost-optimized URL is derived from `original_link` by rewriting the
/// `projects.ansonbiggs.com` host prefix to `projects.ansonbiggs.com/ghost-content`.
///
/// This function never fails; it always returns some HTML string:
/// - the inner HTML of the first selector in the candidate list that matches
///   a non-blank element, or
/// - an iframe pointing at `original_link` when no candidate yields content, or
/// - a short "Failed to fetch content" notice linking to `original_link` when
///   the request fails, the server answers with a non-success status, or the
///   body cannot be read.
async fn extract_article_content(original_link: &str) -> String {
    // Candidate containers for the article body, tried in this order.
    const CONTENT_SELECTORS: [&str; 6] = [
        "#quarto-content main",
        "#quarto-content",
        "main",
        "article",
        ".content",
        "body",
    ];

    // Single source for the notice used by every fetch-related failure path.
    let fetch_failed = || {
        format!(
            r#"<p><em>Failed to fetch content. <a href="{}">View original post</a></em></p>"#,
            original_link
        )
    };

    // Convert original link to ghost-content version
    let ghost_link = original_link.replace(
        "projects.ansonbiggs.com",
        "projects.ansonbiggs.com/ghost-content",
    );

    // `reqwest::get` resolves to `Ok` for any completed HTTP exchange, so a
    // 404/500 error page would otherwise be parsed and embedded as if it were
    // the article. `error_for_status` turns those responses into errors.
    let response = match reqwest::get(&ghost_link)
        .await
        .and_then(|resp| resp.error_for_status())
    {
        Ok(resp) => resp,
        Err(_) => return fetch_failed(),
    };

    let html_content = match response.text().await {
        Ok(body) => body,
        Err(_) => return fetch_failed(),
    };

    // The parsed document is only created after the last `.await`, so it is
    // never held across a suspension point.
    let document = Html::parse_document(&html_content);

    for selector_str in &CONTENT_SELECTORS {
        // A selector that fails to parse is skipped rather than treated as fatal.
        let selector = match Selector::parse(selector_str) {
            Ok(selector) => selector,
            Err(_) => continue,
        };

        if let Some(element) = document.select(&selector).next() {
            let content = element.inner_html();
            // Whitespace-only matches do not count; fall through to the next candidate.
            if !content.trim().is_empty() {
                return content;
            }
        }
    }

    // Fallback: return original content with iframe if extraction fails
    format!(r#"<div class="fallback-iframe">
<p><em>Content extraction failed. Falling back to embedded view:</em></p>
<iframe src="{}" style="width: 100%; height: 80vh; border: none;" loading="lazy"></iframe>
</div>"#, original_link)
}
async fn check_if_post_exists(entry: &Entry) -> bool {
let posts_url = "https://notes.ansonbiggs.com/";
let link = entry.links.first().unwrap().href.as_str();