1
0
mirror of https://gitlab.com/Anson-Projects/projects.git synced 2025-09-19 12:02:38 +00:00

1 Commits

Author SHA1 Message Date
06357e6d27 Merge branch 'ghost-content-extraction' into 'master'
Ghost Content Extraction

See merge request Anson-Projects/projects!11
2025-08-21 23:04:14 -07:00
8 changed files with 386 additions and 88 deletions

View File

@@ -1,10 +1,15 @@
stages:
- build
- deploy
build:
stage: build
image:
name: gcr.io/kaniko-project/executor:v1.23.2-debug
entrypoint: [""]
script:
- /kaniko/executor
- >
/kaniko/executor
--context "${CI_PROJECT_DIR}"
--dockerfile "${CI_PROJECT_DIR}/Dockerfile"
--destination "${CI_REGISTRY_IMAGE}:${CI_COMMIT_BRANCH}"
@@ -22,7 +27,7 @@ staging:
paths:
- public
pages:
deploy:
stage: deploy
script:
- echo "Publishing site..."
@@ -31,6 +36,35 @@ pages:
artifacts:
paths:
- public
# Branch preview deployment (for testing).
# Added to the pipeline whenever $CI_COMMIT_BRANCH differs from
# $CI_DEFAULT_BRANCH.
preview:
  stage: deploy
  rules:
    - if: '$CI_COMMIT_BRANCH != $CI_DEFAULT_BRANCH'
  needs:
    # Optional: the pipeline is still valid when the staging job is absent.
    - job: staging
      optional: true
  environment:
    # One environment per ref, keyed by the slugified ref name.
    name: preview/$CI_COMMIT_REF_SLUG
    url: https://${CI_PROJECT_PATH_SLUG}-${CI_COMMIT_REF_SLUG}.gitlab.io
  script:
    - echo "Deploying branch preview..."
    - echo "Preview available at preview URL"
  artifacts:
    paths:
      - public
# GitLab Pages deployment.
# Only added to pipelines where $CI_COMMIT_BRANCH equals $CI_DEFAULT_BRANCH.
pages:
  stage: deploy
  rules:
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
  needs:
    # Starts as soon as the deploy job has finished.
    - deploy
  script:
    - echo "Publishing to GitLab Pages..."
  artifacts:
    paths:
      # Directory handed over as this job's artifact.
      - public

46
AGENTS.md Normal file
View File

@@ -0,0 +1,46 @@
# Repository Guidelines
## Project Structure & Module Organization
- `ghost-upload/`: Rust automation for Ghost CMS publishing.
- `posts/`: Quarto posts with Julia/Python code per post directory.
- `public/`: Quarto build output (generated by `quarto render`).
- Root: Quarto config (`_quarto.yml`), shared assets, CI/CD, docs.
## Build, Test, and Development Commands
- Rust (`ghost-upload/`):
- Build: `cd ghost-upload && cargo build`
- Run: `cd ghost-upload && cargo run`
- Test: `cd ghost-upload && cargo test` (single: `cargo test <test_name>`)
- Lint: `cd ghost-upload && cargo clippy`
- Format: `cd ghost-upload && cargo fmt`
- Julia (root or `posts/*/`):
- Packages: `julia -e "using Pkg; Pkg.instantiate()"`
- Precompile: `julia -e "using Pkg; Pkg.precompile()"`
- Run notebook/script: `julia <filename>.jl`
- Quarto (docs/site):
- Build site: `quarto render --to html --output-dir public`
- Preview: `quarto preview`
- Check: `quarto check`
- Docker: `docker build -t projects .` then `docker run projects`
## Coding Style & Naming Conventions
- Rust: `cargo fmt`; fix all `cargo clippy` warnings. Use `?` over `unwrap()`. Imports: std → external → local. Naming: snake_case (fn/vars), PascalCase (types). Public docs with `///`.
- Julia: 4-space indent; spaces around operators; group `using` at top; snake_case; prefer pipelines `|>` for DataFrames; handle expected errors with try-catch.
- Quarto: Include title/date in YAML; set `echo: false`, `warning: false` for clean outputs; descriptive figure captions and alt text.
## Testing Guidelines
- Rust: Unit tests for core logic; add integration tests for API calls. Run with `cargo test`. Organize tests near code or in `tests/`.
- Julia: Validate transformations and plots visually; keep scripts deterministic.
- Quarto: Manually review rendered HTML for links, figures, and warnings.
## Commit & Pull Request Guidelines
- Commits: Use clear, conventional messages (e.g., `feat:`, `fix:`, `docs:`). Scope small and focused.
- PRs: Provide description, linked issues, steps to validate (commands), and screenshots of rendered docs when relevant.
## Security & Configuration
- Environment variables: `kagi_api_key`, `admin_api_key`. Export locally (e.g., `export admin_api_key=...`); never commit secrets.
- Dependencies: Keep minimal and up-to-date. Prefer configuration via env vars over hardcoded values.
## CI/CD & Deployment
- GitLab CI builds the Docker image and renders the Quarto site for static hosting; the Rust tool (`ghost-upload/`) runs separately to sync content to Ghost. Avoid pipeline changes unless necessary; if you do modify the pipeline, include the rationale in the PR.

View File

@@ -1,3 +1,8 @@
# Directories preserved between pipeline runs to speed up Rust builds.
cache:
  paths:
    # Cargo build output for the ghost-upload crate.
    - ./ghost-upload/target/
    # NOTE(review): presumably only populated when CARGO_HOME points at this
    # directory -- confirm, otherwise this entry caches nothing.
    - ./ghost-upload/cargo/
publish:
stage: deploy
image: rust:latest
@@ -5,8 +10,26 @@ publish:
- cd ./ghost-upload
- cargo run
needs:
- job: pages
- job: deploy
optional: true
- job: staging
optional: true
# Manual trigger to force update all Ghost posts.
#
# Runs the ghost-upload tool with FORCE_UPDATE=true. The job must never start
# on its own, so every rule below carries an explicit `when: manual`: a rule
# that matches without a `when` defaults to `on_success` and would start the
# job automatically.
force-update-ghost:
  stage: deploy
  image: rust:latest
  script:
    - echo "🔄 Force updating all Ghost posts..."
    - cd ./ghost-upload
    - FORCE_UPDATE=true cargo run
  needs:
    - job: deploy
      optional: true
    - job: staging
      optional: true
  rules:
    - if: "$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH"
      when: manual
    # Allow testing on this branch
    - if: "$CI_COMMIT_BRANCH == 'ghost-content-extraction'"
      when: manual
    # Any other pipeline: the job is still offered, again as a manual action.
    - when: manual
  # With `false`, the pipeline is reported as blocked until this job is run;
  # switch to `true` if the trigger should be purely optional.
  allow_failure: false
  variables:
    FORCE_UPDATE: "true"

View File

@@ -4,22 +4,36 @@ This tool synchronizes posts from https://projects.ansonbiggs.com to the Ghost b
## Features
- **Clean content extraction**: Uses Quarto ghost profile to generate clean HTML instead of iframes
- **Duplicate prevention**: Checks Ghost Admin API to avoid creating duplicate posts
- **Automatic sync**: Only uploads new posts by default
- **Content extraction**: Fetches clean HTML content instead of using iframes
- **AI summaries**: Uses Kagi Summarizer for post summaries
- **Dual content rendering**: GitLab CI builds both main site and ghost-optimized versions
- **Force update**: Manual trigger to update all existing posts
## How It Works
## Usage
1. **Dual Build Process**: GitLab CI builds the site twice:
- Main site → `public/` (normal theme with navigation)
- Ghost content → `public/ghost-content/` (minimal theme for content extraction)
### Normal Mode (Default)
```bash
cargo run
```
Only processes new posts that don't exist on the Ghost blog.
2. **Content Extraction**: Rust tool fetches clean HTML from the ghost-content version instead of using iframes
### Force Update Mode
```bash
FORCE_UPDATE=true cargo run
```
Updates ALL posts, including existing ones. Useful for:
- Updating content after changes
- Refreshing summaries
- Applying new styling/formatting
3. **Duplicate Detection**: Uses Ghost Admin API to check for existing posts by slug
## CI/CD Integration
The GitLab CI pipeline includes:
- **Automatic sync**: Runs after each deployment
- **Manual force update**: Available as a manual trigger in GitLab UI
## Environment Variables
- `admin_api_key`: Ghost Admin API key (required)
- `kagi_api_key`: Kagi Summarizer API key (required)
- `kagi_api_key`: Kagi Summarizer API key (required)
- `FORCE_UPDATE`: Set to "true" to update all posts (optional)

View File

@@ -143,7 +143,7 @@ impl Post {
}
/// Returns the slug of a post link: everything after the first `/posts/`
/// segment, with any trailing `/` removed.
///
/// # Panics
///
/// Panics if `link` does not contain `/posts/`.
fn get_slug(link: &str) -> String {
    let (_, tail) = link.split_once("/posts/").unwrap();
    // Trailing slashes are dropped so `.../posts/foo/` and `.../posts/foo`
    // yield the same slug.
    tail.trim_end_matches('/').to_string()
}
async fn extract_article_content(original_link: &str) -> String {
@@ -191,6 +191,16 @@ async fn extract_article_content(original_link: &str) -> String {
}
}
/// Reports whether a page for `entry` is already reachable on the blog.
///
/// The slug of the entry's first link is appended to the blog root and
/// requested over HTTP. A success status yields `true`; any other status or
/// a request error yields `false`.
///
/// # Panics
///
/// Panics if the entry has no links.
async fn check_if_post_exists(entry: &Entry) -> bool {
    const BLOG_ROOT: &str = "https://notes.ansonbiggs.com/";

    let href = entry.links.first().unwrap().href.as_str();
    let target = format!("{}{}", BLOG_ROOT, get_slug(href));

    reqwest::get(target)
        .await
        .map(|resp| resp.status().is_success())
        .unwrap_or(false)
}
#[derive(Deserialize, Debug)]
struct GhostPostsResponse {
@@ -200,6 +210,7 @@ struct GhostPostsResponse {
/// A single post record; derives `Deserialize` so it can be read from a
/// response payload.
/// NOTE(review): presumably an element of `GhostPostsResponse` as returned by
/// the Ghost Admin API -- confirm against the part of the file not shown here.
#[derive(Deserialize, Debug)]
struct GhostPost {
// Identifier of the post.
// NOTE(review): looks like the value used to build the update URL -- confirm.
id: String,
// Slug of the post.
slug: String,
}
async fn get_existing_post_id(slug: &str, token: &str) -> Option<String> {
@@ -228,47 +239,10 @@ async fn get_existing_post_id(slug: &str, token: &str) -> Option<String> {
}
/// Downloads the feed at `url` and returns its parsed entries.
///
/// The feed is requested exactly once. Every failure mode (request error,
/// non-success HTTP status, unreadable body, blank body, parse error) is
/// logged to stdout and results in an empty vector rather than a panic, so
/// the caller can treat "no entries" as "nothing to do".
async fn fetch_feed(url: &str) -> Vec<Entry> {
    println!("Fetching RSS feed from: {}", url);

    let response = match reqwest::get(url).await {
        Ok(resp) => resp,
        Err(e) => {
            println!("Failed to fetch RSS feed: {}", e);
            return vec![];
        }
    };

    if !response.status().is_success() {
        println!("RSS feed request failed with status: {}", response.status());
        return vec![];
    }

    let content = match response.text().await {
        Ok(text) => text,
        Err(e) => {
            println!("Failed to read RSS feed content: {}", e);
            return vec![];
        }
    };

    if content.trim().is_empty() {
        println!("RSS feed content is empty");
        return vec![];
    }

    // Truncate by characters, not bytes: slicing a `str` at a byte offset that
    // falls inside a multi-byte UTF-8 sequence panics.
    let preview = |limit: usize| content.chars().take(limit).collect::<String>();

    println!("RSS feed content preview: {}", preview(200));

    let feed = match parser::parse(content.as_bytes()) {
        Ok(f) => f,
        Err(e) => {
            println!("Failed to parse RSS feed: {:?}", e);
            println!("Feed content starts with: {}", preview(500));
            return vec![];
        }
    };

    println!("Successfully parsed RSS feed with {} entries", feed.entries.len());
    feed.entries
}
@@ -331,7 +305,15 @@ async fn main() {
let ghost_api_url = "https://notes.ansonbiggs.com/ghost/api/v3/admin/posts/?source=html";
let ghost_admin_api_key = env::var("admin_api_key").unwrap();
// Check if force update is enabled
let force_update = env::var("FORCE_UPDATE").unwrap_or_default() == "true";
if force_update {
println!("🔄 FORCE UPDATE MODE ENABLED");
println!(" This will update ALL posts, including existing ones.");
} else {
println!("📝 NORMAL MODE - Only publishing new posts");
}
let feed = "https://projects.ansonbiggs.com/index.xml";
@@ -365,30 +347,26 @@ async fn main() {
// Prepare the post data
let entries = fetch_feed(feed).await;
if entries.is_empty() {
println!("No entries found in RSS feed or feed parsing failed. Exiting.");
return;
}
println!("Processing {} entries from RSS feed", entries.len());
let post_exists_futures = entries.into_iter().map(|entry| {
let entry_clone = entry.clone();
let token_clone = token.clone();
async move {
let link = entry.links.first().unwrap().href.as_str();
let slug = get_slug(link);
(entry_clone, get_existing_post_id(&slug, &token_clone).await.is_some())
}
});
let filtered_entries: Vec<Entry> = if force_update {
println!("🔄 Force update enabled - processing all {} posts", entries.len());
entries
} else {
let post_exists_futures = entries.into_iter().map(|entry| {
let entry_clone = entry.clone();
async move { (entry_clone, check_if_post_exists(&entry).await) }
});
let post_exists_results = join_all(post_exists_futures).await;
let post_exists_results = join_all(post_exists_futures).await;
let filtered_entries: Vec<Entry> = post_exists_results
.into_iter()
.filter_map(|(entry, exists)| if !exists { Some(entry) } else { None })
.collect();
let new_entries: Vec<Entry> = post_exists_results
.into_iter()
.filter_map(|(entry, exists)| if !exists { Some(entry) } else { None })
.collect();
println!("📝 Found {} new posts to publish", new_entries.len());
new_entries
};
if filtered_entries.is_empty() {
println!("Nothing to post.");
@@ -404,21 +382,46 @@ async fn main() {
posts: vec![post.clone()],
};
let response = client
.post(ghost_api_url)
.header("Authorization", format!("Ghost {}", token))
.json(&post_payload)
.send()
.await
.expect("Request failed");
// Check if this is an update (for force_update mode)
let (method, url) = if force_update {
if let Some(existing_id) = get_existing_post_id(&post.slug, &token).await {
println!("🔄 Updating existing post: {}", post.title);
("PUT", format!("https://notes.ansonbiggs.com/ghost/api/v3/admin/posts/{}/", existing_id))
} else {
println!("📝 Creating new post: {}", post.title);
("POST", ghost_api_url.to_string())
}
} else {
println!("📝 Creating new post: {}", post.title);
("POST", ghost_api_url.to_string())
};
let response = match method {
"PUT" => client
.put(&url)
.header("Authorization", format!("Ghost {}", token))
.json(&post_payload)
.send()
.await
.expect("Request failed"),
_ => client
.post(&url)
.header("Authorization", format!("Ghost {}", token))
.json(&post_payload)
.send()
.await
.expect("Request failed"),
};
// Check the response
if response.status().is_success() {
println!("Post {} published successfully.", post.title);
let action = if method == "PUT" { "updated" } else { "published" };
println!("✅ Post '{}' {} successfully.", post.title, action);
} else {
let action = if method == "PUT" { "update" } else { "publish" };
println!(
"Failed to publish post {}.\n\tResp: {:?}",
&post.title, response
"Failed to {} post '{}'.\n\tStatus: {}\n\tResponse: {:?}",
action, &post.title, response.status(), response.text().await.unwrap_or_default()
);
}
}

34
test-ghost-profile.md Normal file
View File

@@ -0,0 +1,34 @@
# Test Ghost Profile Output
This is a test document to validate our ghost profile setup.
## Content Structure
The ghost profile should:
- Remove navigation elements
- Use minimal styling from ghost-iframe.css
- Maintain clean article layout
- Remove table of contents
## Code Example
```julia
println("Hello from Julia!")
x = 1 + 1
```
## Regular Content
This is just some regular markdown content to see how it renders in the ghost profile.
- List item 1
- List item 2
- List item 3
**Bold text** and *italic text* should render properly.
[Link to main site](https://projects.ansonbiggs.com)
## Summary
If you can see clean, minimal styling without navigation, the ghost profile is working correctly.

55
test-local-deployment.sh Executable file
View File

@@ -0,0 +1,55 @@
#!/bin/bash
# Local smoke test approximating the checks the CI pipeline depends on.
#
# Run from the repository root. Verifies that the Quarto ghost profile is
# declared, that the Rust uploader type-checks, and that
# validate-ghost-extraction.sh passes. Exits 1 at the first failing step.
echo "🧪 Testing local deployment simulation..."
# Create test directories
mkdir -p test-output/main
mkdir -p test-output/ghost-content
echo "📁 Simulating dual-output build..."
# Test 1: Check if ghost profile exists
if grep -q "ghost:" _quarto.yml; then
    echo "✅ Ghost profile configuration found"
else
    echo "❌ Ghost profile not found"
    exit 1
fi
# Test 2: Type-check the Rust uploader (no URL is fetched, nothing is run)
echo "🔍 Testing content extraction logic..."
echo "📝 Testing Rust compilation and basic logic..."
# Run in a subshell so that a failing `cd` is reported as a failure instead of
# letting cargo run in the wrong directory, and so the caller's working
# directory is left untouched.
if (cd ghost-upload && cargo check --quiet); then
    echo "✅ Rust code compiles successfully"
else
    echo "❌ Rust compilation failed"
    exit 1
fi
# Test 3: Check if CI would work
echo "🔧 Validating CI configuration..."
# The validator's own output is discarded; only its exit status is used.
if ./validate-ghost-extraction.sh > /dev/null 2>&1; then
    echo "✅ CI validation passed"
else
    echo "❌ CI validation failed"
    exit 1
fi
echo ""
echo "🎉 Local testing complete!"
echo ""
echo "📋 What happens in CI:"
echo " 1. Builds main site → public/"
echo " 2. Builds ghost content → public/ghost-content/"
echo " 3. Rust extracts from ghost-content URLs"
echo " 4. Posts to Ghost blog with clean HTML"
echo ""
echo "🚀 Ready for branch testing in GitLab CI!"
echo " • Download artifacts to see both outputs"
echo " • Use manual trigger to test force-update"
echo " • Check ghost-content/ folder structure"

89
validate-ghost-extraction.sh Executable file
View File

@@ -0,0 +1,89 @@
#!/bin/bash
# Simple validation script for ghost content extraction
#
# Run from the repository root. Performs static checks (required files,
# expected markers in config and source files) followed by `cargo check`.
# Exits 1 at the first failed check.

# Print a failure line and abort the script.
#   $1 - message shown after the cross mark
fail() {
    echo "❌ $1"
    exit 1
}

# Abort unless the given path is a regular file.
#   $1 - path to test
#   $2 - failure message
require_file() {
    [ -f "$1" ] || fail "$2"
}

# Report success when a pattern occurs in a file, abort otherwise.
#   $1 - grep pattern
#   $2 - file to search
#   $3 - success message
#   $4 - failure message
require_pattern() {
    if grep -q -- "$1" "$2"; then
        echo "✅ $3"
    else
        fail "$4"
    fi
}

echo "🔍 Validating ghost profile implementation..."
# Check if required files exist
echo "📁 Checking required files..."
require_file "_quarto.yml" "_quarto.yml not found"
require_file "ghost-iframe.css" "ghost-iframe.css not found"
require_file "ghost-upload/src/main.rs" "Rust source not found"
echo "✅ All required files present"
# Check if ghost profile is defined in _quarto.yml
echo "📋 Checking ghost profile configuration..."
require_pattern "ghost:" _quarto.yml \
    "Ghost profile found in _quarto.yml" \
    "Ghost profile not found in _quarto.yml"
# Check if GitLab CI builds both versions
echo "🔧 Checking GitLab CI configuration..."
require_pattern "ghost-content" .gitlab-ci.yml \
    "GitLab CI configured for dual output" \
    "GitLab CI not configured for ghost-content"
# Check if Rust code has extract_article_content function
echo "🦀 Checking Rust implementation..."
require_pattern "extract_article_content" ghost-upload/src/main.rs \
    "Content extraction function found" \
    "Content extraction function not found"
# Check if force update functionality is available
require_pattern "FORCE_UPDATE" ghost-upload/src/main.rs \
    "Force update functionality found" \
    "Force update functionality not found"
# Check if manual CI job is configured
require_pattern "force-update-ghost" ghost-upload/.gitlab-ci.yml \
    "Manual force update CI job found" \
    "Manual force update CI job not found"
# Verify Rust code compiles
echo "🛠️ Building Rust code..."
# Subshell: a failing `cd` counts as a failed check, and the working directory
# of the script itself never changes.
if (cd ghost-upload && cargo check --quiet); then
    echo "✅ Rust code compiles successfully"
else
    fail "Rust compilation failed"
fi
echo ""
echo "🎉 All validations passed!"
echo "📋 Summary of changes:"
echo " • Quarto profiles for dual-output rendering"
echo " • Ghost-optimized CSS styling"
echo " • GitLab CI builds both main site and ghost-content"
echo " • Rust extracts HTML content instead of using iframes"
echo " • Force update mode to refresh existing posts"
echo " • Manual CI trigger for content updates"
echo ""
echo "🚀 Ready for testing in CI/CD pipeline!"