Debug CI artifacts to locate RSS feed file

Fix RSS feed parsing by reading from local artifacts
- Change fetch_feed to read from a local file instead of making an HTTP request
- Update feed path to use ../public/index.xml from GitLab CI artifacts
- Add better error messages for file reading and parsing failures
- Resolves ParseError(NoFeedRoot) by avoiding a 404 from the live website
2025-09-19 03:52:37 +00:00 · 2025-08-23 00:43:59 -06:00 · 2025-08-22 23:34:32 -06:00 · 2025-08-22 11:32:50 -07:00 · 2025-08-22 11:32:49 -07:00
6 changed files with 338 additions and 46 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -14,8 +14,10 @@ staging:
  stage: deploy
  image: ${CI_REGISTRY_IMAGE}:${CI_COMMIT_BRANCH}
  script:
-    - echo "Building the project with Quarto..."
+    - echo "Building the main website with Quarto..."
    - quarto render --to html --output-dir public
    - echo "Building Ghost-optimized version..."
    - quarto render --profile ghost --to html --output-dir public/ghost-content
  artifacts:
    paths:
      - public
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -1,7 +1,9 @@
 project:
  type: website
-website:
+profiles:
  default:
    website:
      title: "Anson's Projects"
      site-url: https://projects.ansonbiggs.com
      description: A Blog for Technical Topics
@@ -15,11 +17,26 @@ website:
          # - icon: gitlab
          #   href: https://gitlab.com/MisterBiggs
      open-graph: true
-format:
+    format:
      html:
        theme: zephyr
        css: styles.css
        # toc: true
  ghost:
    website:
      title: "Anson's Projects"
      site-url: https://projects.ansonbiggs.com
      description: A Blog for Technical Topics
      navbar: false
      open-graph: true
    format:
      html:
        theme: none
        css: ghost-iframe.css
        toc: false
        page-layout: article
        title-block-banner: false
 execute: 
  freeze: true
--- a/ghost-iframe.css
+++ b/ghost-iframe.css
@@ -0,0 +1,129 @@
 /* Ghost iframe optimized styles */
 /* Reset margins/padding. This must come BEFORE the body rule: both rules
    match `body` with equal specificity, so whichever appears later wins.
    With the reset last, the body padding below would be overridden to 0. */
 html, body {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
 }
 /* Base typography and spacing for the page when embedded in Ghost */
 body {
    font-family: system-ui, -apple-system, sans-serif;
    line-height: 1.6;
    color: #333;
    max-width: 100%;
    padding: 20px;
    background: white;
 }
 /* Ensure content flows naturally */
 #quarto-content {
    max-width: none;
    padding: 0;
    margin: 0;
 }
 /* Style headings for Ghost */
 h1, h2, h3, h4, h5, h6 {
    margin-top: 1.5em;
    margin-bottom: 0.5em;
    font-weight: 600;
    line-height: 1.3;
 }
 h1 { font-size: 2em; }
 h2 { font-size: 1.5em; }
 h3 { font-size: 1.25em; }
 /* Code blocks */
 pre {
    background: #f8f9fa;
    border: 1px solid #e9ecef;
    border-radius: 6px;
    padding: 1rem;
    overflow-x: auto;
    font-size: 0.875em;
 }
 code {
    font-family: "SF Mono", Monaco, "Cascadia Code", "Roboto Mono", Consolas, "Courier New", monospace;
    background: #f1f3f4;
    padding: 0.2em 0.4em;
    border-radius: 3px;
    font-size: 0.875em;
 }
 pre code {
    background: none;
    padding: 0;
 }
 /* Images */
 img {
    max-width: 100%;
    height: auto;
    border-radius: 4px;
 }
 /* Tables */
 table {
    border-collapse: collapse;
    width: 100%;
    margin: 1em 0;
 }
 th, td {
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
 }
 th {
    background-color: #f2f2f2;
    font-weight: 600;
 }
 /* Links */
 a {
    color: #0066cc;
    text-decoration: none;
 }
 a:hover {
    text-decoration: underline;
 }
 /* Blockquotes */
 blockquote {
    border-left: 4px solid #ddd;
    margin: 1em 0;
    padding-left: 1em;
    color: #666;
    font-style: italic;
 }
 /* Lists */
 ul, ol {
    padding-left: 1.5em;
 }
 li {
    margin-bottom: 0.25em;
 }
 /* Remove any navbar/footer elements that might leak through */
 .navbar, .nav, footer, .sidebar, .toc, .page-footer {
    display: none !important;
 }
 /* Ensure responsive behavior for iframe */
@media (max-width: 768px) {
    body {
        padding: 15px;
        font-size: 16px;
    }
    h1 { font-size: 1.75em; }
    h2 { font-size: 1.35em; }
    h3 { font-size: 1.15em; }
 }
--- a/ghost-upload/.gitlab-ci.yml
+++ b/ghost-upload/.gitlab-ci.yml
@@ -1,12 +1,13 @@
 cache:
  paths:
    - ./ghost-upload/target/
    - ./ghost-upload/cargo/
 publish:
  stage: deploy
  image: rust:latest
  script:
    - echo "Listing project root directory:"
    - ls -la
    - echo "Listing public directory:"
    - ls -la public/ || echo "public directory not found"
    - echo "Looking for index.xml:"
    - find . -name "index.xml" -type f || echo "No index.xml files found"
    - cd ./ghost-upload
    - cargo run
  needs:
--- a/ghost-upload/README.md
+++ b/ghost-upload/README.md
@@ -1,3 +1,25 @@
 # ghost-upload
-This code uploads posts from https://projects.ansonbiggs.com to https://notes.ansonbiggs.com. I couldn't figure out how to update posts, and the kagi API doesn't make it clear how long it caches results for so for now only posts that don't exist on the ghost blog will be uploaded. If you want to update content you need to manually make edits to the code and delete posts on the blog.
+This tool synchronizes posts from https://projects.ansonbiggs.com to the Ghost blog at https://notes.ansonbiggs.com.
 ## Features
 - **Clean content extraction**: Uses Quarto ghost profile to generate clean HTML instead of iframes
 - **Duplicate prevention**: Checks Ghost Admin API to avoid creating duplicate posts
 - **AI summaries**: Uses Kagi Summarizer for post summaries
 - **Dual content rendering**: GitLab CI builds both main site and ghost-optimized versions
 ## How It Works
 1. **Dual Build Process**: GitLab CI builds the site twice:
   - Main site → `public/` (normal theme with navigation)
   - Ghost content → `public/ghost-content/` (minimal theme for content extraction)
 2. **Content Extraction**: Rust tool fetches clean HTML from the ghost-content version instead of using iframes
 3. **Duplicate Detection**: Uses Ghost Admin API to check for existing posts by slug
 ## Environment Variables
 - `admin_api_key`: Ghost Admin API key (required)
 - `kagi_api_key`: Kagi Summarizer API key (required)
--- a/ghost-upload/src/main.rs
+++ b/ghost-upload/src/main.rs
@@ -45,13 +45,29 @@ impl Post {
        let slug = get_slug(link);
        let summary = summarize_url(link).await;
        // Extract content from ghost-optimized version
        let ghost_content = extract_article_content(&link).await;
        let html = html! {
            div class="ghost-summary" {
                h3 { "Summary" }
                p { (summary) }
-            iframe src=(link) style="width: 100%; height: 80vh" { }
+            }
            div class="ghost-content" {
                (maud::PreEscaped(ghost_content))
            }
            div class="ghost-footer" {
                hr {}
                p {
-                "This content was originally posted on my projects website " a href=(link) { "here." }
+                    em {
-                " The above summary was made by the " a href=("https://help.kagi.com/kagi/api/summarizer.html")
+                        "This content was originally posted on my projects website " 
-                {"Kagi Summarizer"}
+                        a href=(link) { "here" }
                        ". The above summary was generated by the " 
                        a href=("https://help.kagi.com/kagi/api/summarizer.html") {"Kagi Summarizer"} 
                        "."
                    }
                }
            }
        }.into_string();
@@ -127,24 +143,122 @@ impl Post {
 }
 fn get_slug(link: &str) -> String {
-    link.split_once("/posts/").unwrap().1.to_string()
+    link.split_once("/posts/").unwrap().1.trim_end_matches('/').to_string()
 }
-async fn check_if_post_exists(entry: &Entry) -> bool {
+async fn extract_article_content(original_link: &str) -> String {
-    let posts_url = "https://notes.ansonbiggs.com/";
+    // Convert original link to ghost-content version
-    let link = entry.links.first().unwrap().href.as_str();
+    let ghost_link = original_link.replace("projects.ansonbiggs.com", "projects.ansonbiggs.com/ghost-content");
    let slug = get_slug(link);
-    match reqwest::get(format!("{}{}", posts_url, slug)).await {
+    match reqwest::get(&ghost_link).await {
-        Ok(response) => response.status().is_success(),
+        Ok(response) => {
-        Err(_) => false,
+            match response.text().await {
                Ok(html_content) => {
                    let document = Html::parse_document(&html_content);
                    // Try different selectors to find the main content
                    let content_selectors = [
                        "#quarto-content main",
                        "#quarto-content",
                        "main",
                        "article",
                        ".content",
                        "body"
                    ];
                    for selector_str in &content_selectors {
                        if let Ok(selector) = Selector::parse(selector_str) {
                            if let Some(element) = document.select(&selector).next() {
                                let content = element.inner_html();
                                if !content.trim().is_empty() {
                                    return content;
                                }
                            }
                        }
                    }
                    // Fallback: return original content with iframe if extraction fails
                    format!(r#"<div class="fallback-iframe">
                        <p><em>Content extraction failed. Falling back to embedded view:</em></p>
                        <iframe src="{}" style="width: 100%; height: 80vh; border: none;" loading="lazy"></iframe>
                    </div>"#, original_link)
                }
                Err(_) => format!(r#"<p><em>Failed to fetch content. <a href="{}">View original post</a></em></p>"#, original_link)
            }
        }
        Err(_) => format!(r#"<p><em>Failed to fetch content. <a href="{}">View original post</a></em></p>"#, original_link)
    }
 }
 async fn fetch_feed(url: &str) -> Vec<Entry> {
    let content = reqwest::get(url).await.unwrap().text().await.unwrap();
-    let feed = parser::parse(content.as_bytes()).unwrap();
+#[derive(Deserialize, Debug)]
 /// Top-level JSON object returned by the Ghost Admin API post lookup in
 /// `get_existing_post_id`; the matching posts are listed under `posts`.
 struct GhostPostsResponse {
    /// Posts returned for the lookup; the caller only inspects the first one.
    posts: Vec<GhostPost>,
 }
 /// A single post as returned by Ghost. Only the `id` field is declared here.
 #[derive(Deserialize, Debug)]
 struct GhostPost {
    /// Ghost's identifier for the post.
    id: String,
 }
 /// Looks up a post on the Ghost blog by `slug` through the Admin API.
 ///
 /// `token` is sent in the `Authorization: Ghost <token>` header.
 ///
 /// Returns `Some(id)` of the first post in the response, or `None` when the
 /// request fails, the status is not a success, the body cannot be decoded
 /// as a `GhostPostsResponse`, or the response contains no posts.
 async fn get_existing_post_id(slug: &str, token: &str) -> Option<String> {
    let endpoint = format!("https://notes.ansonbiggs.com/ghost/api/v3/admin/posts/slug/{}/", slug);

    // Any transport-level failure is treated the same as "post not found".
    let reply = Client::new()
        .get(&endpoint)
        .header("Authorization", format!("Ghost {}", token))
        .send()
        .await
        .ok()?;

    // Non-2xx (e.g. 404 for an unknown slug) means there is no existing post.
    if !reply.status().is_success() {
        return None;
    }

    let decoded = reply.json::<GhostPostsResponse>().await.ok()?;
    decoded.posts.into_iter().next().map(|found| found.id)
 }
 /// Reads the RSS/Atom feed from the local file at `path` and returns its entries.
 ///
 /// Before reading, prints diagnostics to stderr (current directory, contents
 /// of `..` and of `../public`) to help locate the feed file in CI.
 ///
 /// # Panics
 /// Panics if the file cannot be read or the feed cannot be parsed.
 async fn fetch_feed(path: &str) -> Vec<Entry> {
    // Diagnostics: where are we running from?
    if let Ok(cwd) = std::env::current_dir() {
        eprintln!("Current directory: {:?}", cwd);
    }

    // Diagnostics: what sits next to the crate directory?
    if let Ok(listing) = std::fs::read_dir("..") {
        eprintln!("Files in parent directory:");
        // Unreadable entries are skipped silently.
        listing
            .flatten()
            .for_each(|item| eprintln!("  {:?}", item.path()));
    }

    // Diagnostics: is the CI artifact directory present, and what does it hold?
    match std::fs::read_dir("../public") {
        Ok(listing) => {
            eprintln!("Files in ../public:");
            listing
                .flatten()
                .for_each(|item| eprintln!("  {:?}", item.path()));
        }
        Err(_) => eprintln!("../public directory does not exist or cannot be read"),
    }

    // The feed comes from the local build output rather than the live site.
    let raw = std::fs::read_to_string(path).expect("Failed to read RSS feed file");
    parser::parse(raw.as_bytes())
        .expect("Failed to parse RSS feed")
        .entries
 }
@@ -208,7 +322,9 @@ async fn main() {
    let ghost_api_url = "https://notes.ansonbiggs.com/ghost/api/v3/admin/posts/?source=html";
    let ghost_admin_api_key = env::var("admin_api_key").unwrap();
-    let feed = "https://projects.ansonbiggs.com/index.xml";
+
    let feed = "../public/index.xml";
    // Split the key into ID and SECRET
    let (id, secret) = ghost_admin_api_key
@@ -243,7 +359,12 @@ async fn main() {
    let post_exists_futures = entries.into_iter().map(|entry| {
        let entry_clone = entry.clone();
-        async move { (entry_clone, check_if_post_exists(&entry).await) }
+        let token_clone = token.clone();
        async move { 
            let link = entry.links.first().unwrap().href.as_str();
            let slug = get_slug(link);
            (entry_clone, get_existing_post_id(&slug, &token_clone).await.is_some()) 
        }
    });
    let post_exists_results = join_all(post_exists_futures).await;
Author	SHA1	Message	Date
Anson	cf5021e682	Debug CI artifacts to locate RSS feed file	2025-08-23 00:43:59 -06:00
Anson	54f2a1bc53	Fix RSS feed parsing by reading from local artifacts - Change fetch_feed to read from local file instead of HTTP request - Update feed path to use ../public/index.xml from GitLab CI artifacts - Add better error messages for file reading and parsing failures - Resolves ParseError(NoFeedRoot) by avoiding 404 from live website	2025-08-22 23:34:32 -06:00
Anson Biggs	a6dd33ce5f	Merge branch 'ghost-content-extraction' into 'master' Claude: Ghost Content Extraction See merge request Anson-Projects/projects!11	2025-08-22 11:32:50 -07:00
Anson Biggs	556c56fee4	Claude: Ghost Content Extraction	2025-08-22 11:32:49 -07:00