Add initial project structure with README, Cargo configuration, and main logic
- Create .gitignore to exclude target and output directories
- Initialize Cargo.toml with project metadata and dependencies
- Add README.md with project description, features, installation, and usage instructions
- Implement main.rs for extracting blog posts from HTML files and exporting to JSON and text formats
This commit is contained in:
134
src/main.rs
Normal file
134
src/main.rs
Normal file
@@ -0,0 +1,134 @@
|
||||
use scraper::{Html, Selector};
|
||||
use serde::Serialize;
|
||||
use std::env;
|
||||
use std::fs::{self};
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::process;
|
||||
|
||||
// We add 'Serialize' so this struct can be turned into JSON automatically
|
||||
#[derive(Serialize)]
|
||||
struct BlogPost {
|
||||
title: String,
|
||||
date: String,
|
||||
content: String,
|
||||
// Added source filename so you know where the data came from in the big file
|
||||
source_file: String,
|
||||
}
|
||||
|
||||
fn extract_content(file_path: &Path) -> Result<BlogPost, Box<dyn std::error::Error>> {
|
||||
let html_content = fs::read_to_string(file_path)?;
|
||||
let document = Html::parse_document(&html_content);
|
||||
|
||||
let title_selector = Selector::parse("h3.post-title").unwrap();
|
||||
let date_selector = Selector::parse("time.published").unwrap();
|
||||
let body_selector = Selector::parse(".post-body.entry-content").unwrap();
|
||||
|
||||
let title = document
|
||||
.select(&title_selector)
|
||||
.next()
|
||||
.map(|el| el.text().collect::<Vec<_>>().join(""))
|
||||
.unwrap_or("No Title Found".to_string())
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
let date = document
|
||||
.select(&date_selector)
|
||||
.next()
|
||||
.map(|el| el.text().collect::<Vec<_>>().join(""))
|
||||
.unwrap_or("No Date Found".to_string())
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
let content = document
|
||||
.select(&body_selector)
|
||||
.next()
|
||||
.map(|el| {
|
||||
el.text()
|
||||
.map(|t| t.trim())
|
||||
.filter(|t| !t.is_empty())
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n")
|
||||
})
|
||||
.unwrap_or("No Content Found".to_string());
|
||||
|
||||
Ok(BlogPost {
|
||||
title,
|
||||
date,
|
||||
content,
|
||||
source_file: file_path.file_name().unwrap().to_string_lossy().to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = env::args().collect();
|
||||
|
||||
if args.len() < 2 {
|
||||
eprintln!("Usage: cargo run -- <directory_path>");
|
||||
process::exit(1);
|
||||
}
|
||||
|
||||
let dir_path = &args[1];
|
||||
let path = Path::new(dir_path);
|
||||
|
||||
if !path.exists() || !path.is_dir() {
|
||||
eprintln!("Error: '{}' is not a valid directory.", dir_path);
|
||||
process::exit(1);
|
||||
}
|
||||
|
||||
// --- SETUP OUTPUTS ---
|
||||
let output_dir = Path::new("output");
|
||||
let json_dir = output_dir.join("json");
|
||||
|
||||
// Create directories if they don't exist
|
||||
fs::create_dir_all(&json_dir).expect("Could not create output directories");
|
||||
|
||||
// Create/Truncate the big text file (Option B)
|
||||
let big_text_path = output_dir.join("all_posts_combined.txt");
|
||||
let mut big_file = fs::File::create(&big_text_path).expect("Could not create big text file");
|
||||
|
||||
println!("Scanning directory: {}", dir_path);
|
||||
println!("Output will be saved to: ./output/\n");
|
||||
|
||||
let entries = fs::read_dir(path).expect("Could not read directory");
|
||||
|
||||
for entry in entries {
|
||||
if let Ok(entry) = entry {
|
||||
let input_path = entry.path();
|
||||
|
||||
if input_path.is_file() && input_path.extension().map_or(false, |ext| ext == "html") {
|
||||
match extract_content(&input_path) {
|
||||
Ok(post) => {
|
||||
println!("Processing: {}", post.title);
|
||||
|
||||
// --- OPTION A: Save to individual JSON file ---
|
||||
// We change the extension from .html to .json
|
||||
let file_stem = input_path.file_stem().unwrap();
|
||||
let json_path = json_dir.join(file_stem).with_extension("json");
|
||||
|
||||
let json_file =
|
||||
fs::File::create(&json_path).expect("Failed to create JSON file");
|
||||
serde_json::to_writer_pretty(json_file, &post)
|
||||
.expect("Failed to write JSON");
|
||||
|
||||
// --- OPTION B: Append to one big text file ---
|
||||
// We use `writeln!` macro to write formatted text to the file stream
|
||||
writeln!(
|
||||
big_file,
|
||||
"==================================================\n\
|
||||
FILE: {}\n\
|
||||
TITLE: {}\n\
|
||||
DATE: {}\n\
|
||||
--------------------------------------------------\n\
|
||||
{}\n\n",
|
||||
post.source_file, post.title, post.date, post.content
|
||||
)
|
||||
.expect("Failed to write to big text file");
|
||||
}
|
||||
Err(e) => eprintln!("Error processing {:?}: {}", input_path.file_name(), e),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
println!("\nDone! Check the 'output' folder.");
|
||||
}
|
||||
Reference in New Issue
Block a user