Replace iframe with direct HTML content extraction

- Extract article content from ghost-optimized pages
- Add extract_article_content() function with fallback to iframe
- Try multiple selectors to find main content area
- Provide graceful fallbacks for failed content extraction
- Remove unused variables and fix warnings
2025-09-14 09:35:04 +00:00 · 2025-08-21 23:24:53 -06:00
parent e233a96f55
commit cdb96a50b7
1 changed files with 67 additions and 6 deletions
--- a/ghost-upload/src/main.rs
+++ b/ghost-upload/src/main.rs
@@ -45,13 +45,29 @@ impl Post {
        let slug = get_slug(link);

        let summary = summarize_url(link).await;
+        
+        // Extract content from ghost-optimized version
+        let ghost_content = extract_article_content(&link).await;
+        
        let html = html! {
+            div class="ghost-summary" {
+                h3 { "Summary" }
                p { (summary) }
-            iframe src=(link) style="width: 100%; height: 80vh" { }
+            }
+            div class="ghost-content" {
+                (maud::PreEscaped(ghost_content))
+            }
+            div class="ghost-footer" {
+                hr {}
                p {
-                "This content was originally posted on my projects website " a href=(link) { "here." }
-                " The above summary was made by the " a href=("https://help.kagi.com/kagi/api/summarizer.html")
-                {"Kagi Summarizer"}
+                    em {
+                        "This content was originally posted on my projects website " 
+                        a href=(link) { "here" }
+                        ". The above summary was generated by the " 
+                        a href=("https://help.kagi.com/kagi/api/summarizer.html") {"Kagi Summarizer"} 
+                        "."
+                    }
+                }
            }
        }.into_string();

@@ -130,6 +146,51 @@ fn get_slug(link: &str) -> String {
    link.split_once("/posts/").unwrap().1.to_string()
 }

+/// Fetches the ghost-optimized copy of `original_link` and returns its main content HTML.
+/// The first selector (most specific first) whose element has non-blank inner HTML wins.
+/// A transport error or non-success HTTP status yields a "View original post" link, so an
+/// error page is never embedded; a page without usable content yields an iframe fallback.
+async fn extract_article_content(original_link: &str) -> String {
+    // Convert original link to ghost-content version
+    let ghost_link = original_link.replace("projects.ansonbiggs.com", "projects.ansonbiggs.com/ghost-content");
+    // `reqwest::get` succeeds on 4xx/5xx responses, so turn those into errors explicitly.
+    let body = match reqwest::get(&ghost_link).await.and_then(|response| response.error_for_status()) {
+        Ok(response) => response.text().await,
+        Err(error) => Err(error),
+    };
+    let html_content = match body {
+        Ok(html_content) => html_content,
+        Err(_) => return format!(r#"<p><em>Failed to fetch content. <a href="{}">View original post</a></em></p>"#, original_link),
+    };
+    let document = Html::parse_document(&html_content);
+
+    // Try different selectors to find the main content
+    let content_selectors = [
+        "#quarto-content main",
+        "#quarto-content",
+        "main",
+        "article",
+        ".content",
+        "body",
+    ];
+    for selector_str in &content_selectors {
+        if let Ok(selector) = Selector::parse(selector_str) {
+            if let Some(element) = document.select(&selector).next() {
+                let content = element.inner_html();
+                if !content.trim().is_empty() {
+                    return content;
+                }
+            }
+        }
+    }
+
+    // Fallback: return original content with iframe if extraction fails
+    format!(r#"<div class="fallback-iframe">
+                        <p><em>Content extraction failed. Falling back to embedded view:</em></p>
+                        <iframe src="{}" style="width: 100%; height: 80vh; border: none;" loading="lazy"></iframe>
+                    </div>"#, original_link)
+}
+
 async fn check_if_post_exists(entry: &Entry) -> bool {
    let posts_url = "https://notes.ansonbiggs.com/";
    let link = entry.links.first().unwrap().href.as_str();