Add initial project structure with README, Cargo configuration, and main logic

- Create .gitignore to exclude target and output directories
- Initialize Cargo.toml with project metadata and dependencies (dependency section sketched below)
- Add README.md with project description, features, installation, and usage instructions
- Implement main.rs for extracting blog posts from HTML files and exporting to JSON and text formats
Committed on 2025-12-11 21:22:31 +01:00
commit 56306a4852
5 changed files with 744 additions and 0 deletions
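
The Cargo.toml introduced by this commit is not shown in this view. Judging from the imports in src/main.rs below, its dependency section presumably looks roughly like this; the version numbers are placeholders, not taken from the commit:

[dependencies]
scraper = "0.19"
serde = { version = "1", features = ["derive"] }
serde_json = "1"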

src/main.rs (new file, 134 lines)

@@ -0,0 +1,134 @@
use scraper::{Html, Selector};
use serde::Serialize;
use std::env;
use std::fs;
use std::io::Write;
use std::path::Path;
use std::process;

// We add 'Serialize' so this struct can be turned into JSON automatically
#[derive(Serialize)]
struct BlogPost {
    title: String,
    date: String,
    content: String,
    // Added source filename so you know where the data came from in the big file
    source_file: String,
}

fn extract_content(file_path: &Path) -> Result<BlogPost, Box<dyn std::error::Error>> {
    let html_content = fs::read_to_string(file_path)?;
    let document = Html::parse_document(&html_content);

    let title_selector = Selector::parse("h3.post-title").unwrap();
    let date_selector = Selector::parse("time.published").unwrap();
    let body_selector = Selector::parse(".post-body.entry-content").unwrap();

    let title = document
        .select(&title_selector)
        .next()
        .map(|el| el.text().collect::<Vec<_>>().join(""))
        .unwrap_or("No Title Found".to_string())
        .trim()
        .to_string();

    let date = document
        .select(&date_selector)
        .next()
        .map(|el| el.text().collect::<Vec<_>>().join(""))
        .unwrap_or("No Date Found".to_string())
        .trim()
        .to_string();

    let content = document
        .select(&body_selector)
        .next()
        .map(|el| {
            el.text()
                .map(|t| t.trim())
                .filter(|t| !t.is_empty())
                .collect::<Vec<_>>()
                .join("\n")
        })
        .unwrap_or("No Content Found".to_string());

    Ok(BlogPost {
        title,
        date,
        content,
        source_file: file_path.file_name().unwrap().to_string_lossy().to_string(),
    })
}

fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: cargo run -- <directory_path>");
        process::exit(1);
    }

    let dir_path = &args[1];
    let path = Path::new(dir_path);
    if !path.exists() || !path.is_dir() {
        eprintln!("Error: '{}' is not a valid directory.", dir_path);
        process::exit(1);
    }

    // --- SETUP OUTPUTS ---
    let output_dir = Path::new("output");
    let json_dir = output_dir.join("json");

    // Create directories if they don't exist
    fs::create_dir_all(&json_dir).expect("Could not create output directories");

    // Create/Truncate the big text file (Option B)
    let big_text_path = output_dir.join("all_posts_combined.txt");
    let mut big_file = fs::File::create(&big_text_path).expect("Could not create big text file");

    println!("Scanning directory: {}", dir_path);
    println!("Output will be saved to: ./output/\n");

    let entries = fs::read_dir(path).expect("Could not read directory");
    for entry in entries {
        if let Ok(entry) = entry {
            let input_path = entry.path();
            if input_path.is_file() && input_path.extension().map_or(false, |ext| ext == "html") {
                match extract_content(&input_path) {
                    Ok(post) => {
                        println!("Processing: {}", post.title);

                        // --- OPTION A: Save to individual JSON file ---
                        // We change the extension from .html to .json
                        let file_stem = input_path.file_stem().unwrap();
                        let json_path = json_dir.join(file_stem).with_extension("json");
                        let json_file =
                            fs::File::create(&json_path).expect("Failed to create JSON file");
                        serde_json::to_writer_pretty(json_file, &post)
                            .expect("Failed to write JSON");

                        // --- OPTION B: Append to one big text file ---
                        // We use the `writeln!` macro to write formatted text to the file stream
                        writeln!(
                            big_file,
                            "==================================================\n\
                             FILE: {}\n\
                             TITLE: {}\n\
                             DATE: {}\n\
                             --------------------------------------------------\n\
                             {}\n\n",
                            post.source_file, post.title, post.date, post.content
                        )
                        .expect("Failed to write to big text file");
                    }
                    Err(e) => eprintln!("Error processing {:?}: {}", input_path.file_name(), e),
                }
            }
        }
    }

    println!("\nDone! Check the 'output' folder.");
}
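
Not part of the commit itself, but a minimal sketch of how the exported data could be consumed afterwards: each per-post JSON file written to output/json/ can be read back with serde. The struct simply mirrors the BlogPost fields above, and the file name used here is a hypothetical example.

use serde::Deserialize;
use std::fs;

// Mirror of the BlogPost fields written by the exporter above.
#[derive(Deserialize, Debug)]
struct BlogPost {
    title: String,
    date: String,
    content: String,
    source_file: String,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical file name; any file the exporter placed in output/json/ works.
    let json = fs::read_to_string("output/json/example-post.json")?;
    let post: BlogPost = serde_json::from_str(&json)?;
    println!("{} ({}) from {}", post.title, post.date, post.source_file);
    println!("{} characters of content", post.content.len());
    Ok(())
}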