1
0
mirror of https://gitlab.com/Anson-Projects/projects.git synced 2025-09-19 12:02:38 +00:00

2 Commits

Author SHA1 Message Date
a6dd33ce5f Merge branch 'ghost-content-extraction' into 'master'
Claude: Ghost Content Extraction

See merge request Anson-Projects/projects!11
2025-08-22 11:32:50 -07:00
556c56fee4 Claude: Ghost Content Extraction 2025-08-22 11:32:49 -07:00
7 changed files with 799 additions and 784 deletions

View File

@@ -14,8 +14,10 @@ staging:
stage: deploy stage: deploy
image: ${CI_REGISTRY_IMAGE}:${CI_COMMIT_BRANCH} image: ${CI_REGISTRY_IMAGE}:${CI_COMMIT_BRANCH}
script: script:
- echo "Building the project with Quarto..." - echo "Building the main website with Quarto..."
- quarto render --to html --output-dir public - quarto render --to html --output-dir public
- echo "Building Ghost-optimized version..."
- quarto render --profile ghost --to html --output-dir public/ghost-content
artifacts: artifacts:
paths: paths:
- public - public

View File

@@ -1,7 +1,9 @@
project: project:
type: website type: website
website: profiles:
default:
website:
title: "Anson's Projects" title: "Anson's Projects"
site-url: https://projects.ansonbiggs.com site-url: https://projects.ansonbiggs.com
description: A Blog for Technical Topics description: A Blog for Technical Topics
@@ -15,11 +17,26 @@ website:
# - icon: gitlab # - icon: gitlab
# href: https://gitlab.com/MisterBiggs # href: https://gitlab.com/MisterBiggs
open-graph: true open-graph: true
format: format:
html: html:
theme: zephyr theme: zephyr
css: styles.css css: styles.css
# toc: true # toc: true
ghost:
website:
title: "Anson's Projects"
site-url: https://projects.ansonbiggs.com
description: A Blog for Technical Topics
navbar: false
open-graph: true
format:
html:
theme: none
css: ghost-iframe.css
toc: false
page-layout: article
title-block-banner: false
execute: execute:
freeze: true freeze: true

129
ghost-iframe.css Normal file
View File

@@ -0,0 +1,129 @@
/* Ghost iframe optimized styles */
/* Reset margins/padding on the root elements FIRST.
   `body` and `html, body` have equal specificity, so the later rule wins;
   with the reset placed after the body rule, its `padding: 0` silently
   cancelled the intended 20px body padding. */
html, body {
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}
/* Base typography and spacing for the document embedded in Ghost */
body {
  font-family: system-ui, -apple-system, sans-serif;
  line-height: 1.6;
  color: #333;
  max-width: 100%;
  padding: 20px;
  background: white;
}
/* Ensure content flows naturally: the #quarto-content element gets no width
   cap and no spacing of its own, so only the body padding applies */
#quarto-content {
max-width: none;
padding: 0;
margin: 0;
}
/* Style headings for Ghost: shared spacing, weight and line height for all
   six heading levels */
h1, h2, h3, h4, h5, h6 {
margin-top: 1.5em;
margin-bottom: 0.5em;
font-weight: 600;
line-height: 1.3;
}
/* Explicit sizes for the top three levels only; no size is set here for
   h4-h6 */
h1 { font-size: 2em; }
h2 { font-size: 1.5em; }
h3 { font-size: 1.25em; }
/* Code blocks: boxed, horizontally scrollable, slightly smaller text */
pre {
  background: #f8f9fa;
  border: 1px solid #e9ecef;
  border-radius: 6px;
  padding: 1rem;
  overflow-x: auto;
  font-size: 0.875em;
}
/* Inline code: monospace stack with a subtle pill background */
code {
  font-family: "SF Mono", Monaco, "Cascadia Code", "Roboto Mono", Consolas, "Courier New", monospace;
  background: #f1f3f4;
  padding: 0.2em 0.4em;
  border-radius: 3px;
  font-size: 0.875em;
}
/* Code inside a block: drop the inline pill styling and inherit the size
   already set on <pre>. Without `font-size: inherit` the two 0.875em values
   multiply, shrinking block code to roughly 0.77em. */
pre code {
  background: none;
  padding: 0;
  font-size: inherit;
}
/* Images: never wider than their container, keep aspect ratio, soft corners */
img {
max-width: 100%;
height: auto;
border-radius: 4px;
}
/* Tables: full-width with collapsed single-line borders */
table {
border-collapse: collapse;
width: 100%;
margin: 1em 0;
}
/* Table cells: thin border, modest padding, left-aligned text */
th, td {
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}
/* Header cells: shaded background and heavier weight */
th {
background-color: #f2f2f2;
font-weight: 600;
}
/* Links: blue, underlined only on hover */
a {
color: #0066cc;
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
/* Blockquotes: left rule, muted italic text */
blockquote {
border-left: 4px solid #ddd;
margin: 1em 0;
padding-left: 1em;
color: #666;
font-style: italic;
}
/* Lists: indent markers and add a little space between items */
ul, ol {
padding-left: 1.5em;
}
li {
margin-bottom: 0.25em;
}
/* Remove any navbar/footer elements that might leak through.
   `!important` forces the hide regardless of other rules' specificity.
   NOTE(review): `.nav` and `footer` are broad selectors and would also hide
   matching elements inside post content (e.g. a tab list carrying a `.nav`
   class) - confirm no post relies on them. */
.navbar, .nav, footer, .sidebar, .toc, .page-footer {
display: none !important;
}
/* Ensure responsive behavior for iframe: at viewport widths of 768px or less,
   tighten the body padding, pin the base font size and scale down the
   top three heading levels */
@media (max-width: 768px) {
body {
padding: 15px;
font-size: 16px;
}
h1 { font-size: 1.75em; }
h2 { font-size: 1.35em; }
h3 { font-size: 1.15em; }
}

View File

@@ -1,8 +1,3 @@
cache:
paths:
- ./ghost-upload/target/
- ./ghost-upload/cargo/
publish: publish:
stage: deploy stage: deploy
image: rust:latest image: rust:latest
@@ -13,17 +8,3 @@ publish:
- pages - pages
rules: rules:
- if: "$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH" - if: "$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH"
publish_update:
stage: deploy
image: rust:latest
variables:
UPDATE_EXISTING: "true"
script:
- cd ./ghost-upload
- cargo run
needs:
- pages
rules:
- if: "$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH"
when: manual

1072
ghost-upload/Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,17 +1,25 @@
# ghost-upload # ghost-upload
This tool uploads posts from https://projects.ansonbiggs.com to https://notes.ansonbiggs.com. This tool synchronizes posts from https://projects.ansonbiggs.com to the Ghost blog at https://notes.ansonbiggs.com.
What's new: ## Features
- Uses the Ghost Admin API to check for existing posts by slug instead of probing the public site.
- Optional update support: set `UPDATE_EXISTING=true` to update an existing post in-place (via `PUT /ghost/api/v3/admin/posts/{id}?source=html`).
- Safer slug handling (trims trailing `/` and falls back to the last path segment).
Env vars: - **Clean content extraction**: Uses Quarto ghost profile to generate clean HTML instead of iframes
- `admin_api_key`: Ghost Admin API key in `key_id:secret` format. - **Duplicate prevention**: Checks Ghost Admin API to avoid creating duplicate posts
- `kagi_api_key`: Kagi Summarizer API key. - **AI summaries**: Uses Kagi Summarizer for post summaries
- `UPDATE_EXISTING` (optional): if `true`/`1`, update posts that already exist in Ghost. - **Dual content rendering**: GitLab CI builds both main site and ghost-optimized versions
Notes: ## How It Works
- Updates use optimistic concurrency by sending the current `updated_at` from Ghost. If someone edits a post in Ghost after we fetch it, the update will fail with a 409 and be reported in the console.
- Summaries are always regenerated when creating or updating; if you want to avoid re-summarizing on updates, leave `UPDATE_EXISTING` unset. 1. **Dual Build Process**: GitLab CI builds the site twice:
- Main site → `public/` (normal theme with navigation)
- Ghost content → `public/ghost-content/` (minimal theme for content extraction)
2. **Content Extraction**: Rust tool fetches clean HTML from the ghost-content version instead of using iframes
3. **Duplicate Detection**: Uses Ghost Admin API to check for existing posts by slug
## Environment Variables
- `admin_api_key`: Ghost Admin API key (required)
- `kagi_api_key`: Kagi Summarizer API key (required)

View File

@@ -1,5 +1,6 @@
use feed_rs::model::Entry; use feed_rs::model::Entry;
use feed_rs::parser; use feed_rs::parser;
use futures::future::join_all;
use jsonwebtoken::{encode, Algorithm, EncodingKey, Header}; use jsonwebtoken::{encode, Algorithm, EncodingKey, Header};
use maud::html; use maud::html;
use reqwest::Client; use reqwest::Client;
@@ -19,29 +20,6 @@ struct PostPayload {
posts: Vec<Post>, posts: Vec<Post>,
} }
#[derive(Serialize, Debug, Clone)]
struct UpdatePost {
id: String,
title: String,
slug: String,
html: String,
status: String,
published_at: String,
updated_at: String,
canonical_url: String,
tags: Vec<String>,
feature_image: Option<String>,
feature_image_alt: Option<String>,
feature_image_caption: Option<String>,
meta_description: Option<String>,
custom_excerpt: Option<String>,
}
#[derive(Serialize, Debug)]
struct UpdatePayload {
posts: Vec<UpdatePost>,
}
#[derive(Serialize, Debug, Clone)] #[derive(Serialize, Debug, Clone)]
struct Post { struct Post {
title: String, title: String,
@@ -67,13 +45,29 @@ impl Post {
let slug = get_slug(link); let slug = get_slug(link);
let summary = summarize_url(link).await; let summary = summarize_url(link).await;
// Extract content from ghost-optimized version
let ghost_content = extract_article_content(&link).await;
let html = html! { let html = html! {
div class="ghost-summary" {
h3 { "Summary" }
p { (summary) } p { (summary) }
iframe src=(link) style="width: 100%; height: 80vh" { } }
div class="ghost-content" {
(maud::PreEscaped(ghost_content))
}
div class="ghost-footer" {
hr {}
p { p {
"This content was originally posted on my projects website " a href=(link) { "here." } em {
" The above summary was made by the " a href=("https://help.kagi.com/kagi/api/summarizer.html") "This content was originally posted on my projects website "
{"Kagi Summarizer"} a href=(link) { "here" }
". The above summary was generated by the "
a href=("https://help.kagi.com/kagi/api/summarizer.html") {"Kagi Summarizer"}
"."
}
}
} }
}.into_string(); }.into_string();
@@ -143,54 +137,94 @@ impl Post {
meta_description, meta_description,
custom_excerpt, custom_excerpt,
}; };
dbg!(&x);
x x
} }
} }
/// Derives the Ghost slug for a post from its URL.
///
/// Prefers everything after the first `/posts/` in `link`; when the link has
/// no `/posts/` component, falls back to its last non-empty path segment
/// instead of panicking. Trailing `/` characters are stripped in both cases.
fn get_slug(link: &str) -> String {
    let raw = match link.split_once("/posts/") {
        Some((_, rest)) => rest,
        // Trim before splitting so a trailing '/' does not yield an empty
        // final segment.
        None => link
            .trim_end_matches('/')
            .rsplit('/')
            .next()
            .unwrap_or(link),
    };
    raw.trim_end_matches('/').to_string()
}
None => link.rsplit('/').next().unwrap_or(link), async fn extract_article_content(original_link: &str) -> String {
}; // Convert original link to ghost-content version
raw.trim_end_matches('/') let ghost_link = original_link.replace("projects.ansonbiggs.com", "projects.ansonbiggs.com/ghost-content");
.to_string()
match reqwest::get(&ghost_link).await {
Ok(response) => {
match response.text().await {
Ok(html_content) => {
let document = Html::parse_document(&html_content);
// Try different selectors to find the main content
let content_selectors = [
"#quarto-content main",
"#quarto-content",
"main",
"article",
".content",
"body"
];
for selector_str in &content_selectors {
if let Ok(selector) = Selector::parse(selector_str) {
if let Some(element) = document.select(&selector).next() {
let content = element.inner_html();
if !content.trim().is_empty() {
return content;
}
}
}
}
// Fallback: return original content with iframe if extraction fails
format!(r#"<div class="fallback-iframe">
<p><em>Content extraction failed. Falling back to embedded view:</em></p>
<iframe src="{}" style="width: 100%; height: 80vh; border: none;" loading="lazy"></iframe>
</div>"#, original_link)
}
Err(_) => format!(r#"<p><em>Failed to fetch content. <a href="{}">View original post</a></em></p>"#, original_link)
}
}
Err(_) => format!(r#"<p><em>Failed to fetch content. <a href="{}">View original post</a></em></p>"#, original_link)
}
}
/// Shape of the JSON body deserialized from the Ghost Admin API post lookup:
/// a top-level `posts` array.
#[derive(Deserialize, Debug)]
struct GhostPostsResponse {
// Posts returned by the lookup; the caller only inspects the first one.
posts: Vec<GhostPost>,
}
#[derive(Deserialize, Debug)] #[derive(Deserialize, Debug)]
struct GhostPostSummary { struct GhostPost {
id: String, id: String,
slug: String,
updated_at: String,
} }
#[derive(Deserialize, Debug)] async fn get_existing_post_id(slug: &str, token: &str) -> Option<String> {
struct GhostPostsResponse<T> { let client = Client::new();
posts: Vec<T>, let api_url = format!("https://notes.ansonbiggs.com/ghost/api/v3/admin/posts/slug/{}/", slug);
}
async fn get_existing_post_by_slug( match client
client: &Client, .get(&api_url)
ghost_admin_base: &str,
token: &str,
slug: &str,
) -> Option<GhostPostSummary> {
// Use Ghost Admin API to search by slug
let url = format!(
"{}/posts/?filter=slug:{}&fields=id,slug,updated_at",
ghost_admin_base, slug
);
let resp = client
.get(url)
.header("Authorization", format!("Ghost {}", token)) .header("Authorization", format!("Ghost {}", token))
.send() .send()
.await .await
.ok()?; {
if !resp.status().is_success() { Ok(response) => {
return None; if response.status().is_success() {
if let Ok(ghost_response) = response.json::<GhostPostsResponse>().await {
ghost_response.posts.first().map(|post| post.id.clone())
} else {
None
}
} else {
None
}
}
Err(_) => None,
} }
let json = resp.json::<GhostPostsResponse<GhostPostSummary>>().await.ok()?;
json.posts.into_iter().next()
} }
async fn fetch_feed(url: &str) -> Vec<Entry> { async fn fetch_feed(url: &str) -> Vec<Entry> {
@@ -257,10 +291,11 @@ async fn summarize_url(url: &str) -> String {
} }
#[tokio::main] #[tokio::main]
async fn main() { async fn main() {
let ghost_admin_base = "https://notes.ansonbiggs.com/ghost/api/v3/admin"; let ghost_api_url = "https://notes.ansonbiggs.com/ghost/api/v3/admin/posts/?source=html";
let ghost_posts_create_url = format!("{}/posts/?source=html", ghost_admin_base);
let ghost_admin_api_key = env::var("admin_api_key").unwrap(); let ghost_admin_api_key = env::var("admin_api_key").unwrap();
let feed = "https://projects.ansonbiggs.com/index.xml"; let feed = "https://projects.ansonbiggs.com/index.xml";
// Split the key into ID and SECRET // Split the key into ID and SECRET
@@ -291,87 +326,56 @@ async fn main() {
) )
.expect("JWT encoding failed"); .expect("JWT encoding failed");
let client = Client::new();
// Prepare the post data // Prepare the post data
let entries = fetch_feed(feed).await; let entries = fetch_feed(feed).await;
// Control whether to update existing posts via env var let post_exists_futures = entries.into_iter().map(|entry| {
let update_existing = env::var("UPDATE_EXISTING").map(|v| v == "1" || v.eq_ignore_ascii_case("true")).unwrap_or(false); let entry_clone = entry.clone();
let token_clone = token.clone();
for entry in entries { async move {
let link = entry.links.first().unwrap().href.as_str(); let link = entry.links.first().unwrap().href.as_str();
let slug = get_slug(link); let slug = get_slug(link);
(entry_clone, get_existing_post_id(&slug, &token_clone).await.is_some())
}
});
let existing = get_existing_post_by_slug(&client, ghost_admin_base, &token, &slug).await; let post_exists_results = join_all(post_exists_futures).await;
match existing { let filtered_entries: Vec<Entry> = post_exists_results
None => { .into_iter()
// Create new post .filter_map(|(entry, exists)| if !exists { Some(entry) } else { None })
let post = Post::new(entry.clone()).await; .collect();
let post_payload = PostPayload { posts: vec![post.clone()] };
if filtered_entries.is_empty() {
println!("Nothing to post.");
return;
}
let post_futures = filtered_entries.into_iter().map(Post::new);
let client = Client::new();
for post in join_all(post_futures).await {
let post_payload = PostPayload {
posts: vec![post.clone()],
};
let response = client let response = client
.post(&ghost_posts_create_url) .post(ghost_api_url)
.header("Authorization", format!("Ghost {}", token)) .header("Authorization", format!("Ghost {}", token))
.json(&post_payload) .json(&post_payload)
.send() .send()
.await .await
.expect("Request failed"); .expect("Request failed");
// Check the response
if response.status().is_success() { if response.status().is_success() {
println!("Post {} published successfully.", post.title); println!("Post {} published successfully.", post.title);
} else { } else {
println!( println!(
"Failed to publish post {}.\n\tStatus: {}", "Failed to publish post {}.\n\tResp: {:?}",
&post.title, &post.title, response
response.status()
); );
} }
} }
Some(summary) => {
if !update_existing {
println!("Post '{}' exists (slug: {}), skipping.", entry.title.unwrap().content, slug);
continue;
}
// Update existing post
let post = Post::new(entry.clone()).await;
let update = UpdatePost {
id: summary.id,
title: post.title,
slug: post.slug,
html: post.html,
status: post.status,
published_at: post.published_at,
updated_at: summary.updated_at,
canonical_url: post.canonical_url,
tags: post.tags,
feature_image: post.feature_image,
feature_image_alt: post.feature_image_alt,
feature_image_caption: post.feature_image_caption,
meta_description: post.meta_description,
custom_excerpt: post.custom_excerpt,
};
let update_url = format!("{}/posts/{}/?source=html", ghost_admin_base, update.id);
let response = client
.put(update_url)
.header("Authorization", format!("Ghost {}", token))
.json(&UpdatePayload { posts: vec![update] })
.send()
.await
.expect("Update request failed");
if response.status().is_success() {
println!("Post '{}' updated successfully.", entry.title.unwrap().content);
} else {
println!(
"Failed to update post '{}' (status: {}).",
entry.title.unwrap().content,
response.status()
);
}
}
}
}
} }