Add initial project structure with README, Cargo configuration, and main logic

- Create .gitignore to exclude target and output directories
- Initialize Cargo.toml with project metadata and dependencies
- Add README.md with project description, features, installation, and usage instructions
- Implement main.rs for extracting blog posts from HTML files and exporting to JSON and text formats
This commit is contained in:
2025-12-11 21:22:31 +01:00
commit 56306a4852
5 changed files with 744 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
/target
/output

509
Cargo.lock generated Normal file
View File

@@ -0,0 +1,509 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "bitflags"
version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
[[package]]
name = "blog-extractor"
version = "0.1.0"
dependencies = [
"scraper",
"serde",
"serde_json",
]
[[package]]
name = "cfg-if"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "cssparser"
version = "0.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"phf",
"smallvec",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn",
]
[[package]]
name = "derive_more"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618"
dependencies = [
"derive_more-impl",
]
[[package]]
name = "derive_more-impl"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b"
dependencies = [
"proc-macro2",
"quote",
"rustc_version",
"syn",
]
[[package]]
name = "dtoa"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]]
name = "ego-tree"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "getopts"
version = "0.2.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
dependencies = [
"unicode-width",
]
[[package]]
name = "html5ever"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6452c4751a24e1b99c3260d505eaeee76a050573e61f30ac2c924ddc7236f01e"
dependencies = [
"log",
"markup5ever",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "libc"
version = "0.2.178"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091"
[[package]]
name = "lock_api"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
dependencies = [
"scopeguard",
]
[[package]]
name = "log"
version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.36.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c3294c4d74d0742910f8c7b466f44dda9eb2d5742c1e430138df290a1e8451c"
dependencies = [
"log",
"tendril",
"web_atoms",
]
[[package]]
name = "memchr"
version = "2.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]]
name = "parking_lot"
version = "0.12.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
dependencies = [
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.9.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"smallvec",
"windows-link",
]
[[package]]
name = "phf"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
dependencies = [
"phf_macros",
"phf_shared",
"serde",
]
[[package]]
name = "phf_codegen"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
dependencies = [
"phf_generator",
"phf_shared",
]
[[package]]
name = "phf_generator"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
dependencies = [
"fastrand",
"phf_shared",
]
[[package]]
name = "phf_macros"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "phf_shared"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
dependencies = [
"siphasher",
]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro2"
version = "1.0.103"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
dependencies = [
"proc-macro2",
]
[[package]]
name = "redox_syscall"
version = "0.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
dependencies = [
"bitflags",
]
[[package]]
name = "rustc-hash"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
[[package]]
name = "rustc_version"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
dependencies = [
"semver",
]
[[package]]
name = "ryu"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "scopeguard"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93cecd86d6259499c844440546d02f55f3e17bd286e529e48d1f9f67e92315cb"
dependencies = [
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"precomputed-hash",
"selectors",
"tendril",
]
[[package]]
name = "selectors"
version = "0.33.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7"
dependencies = [
"bitflags",
"cssparser",
"derive_more",
"log",
"new_debug_unreachable",
"phf",
"phf_codegen",
"precomputed-hash",
"rustc-hash",
"servo_arc",
"smallvec",
]
[[package]]
name = "semver"
version = "1.0.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
[[package]]
name = "serde"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
dependencies = [
"serde_core",
"serde_derive",
]
[[package]]
name = "serde_core"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.228"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.145"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
"serde_core",
]
[[package]]
name = "servo_arc"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "siphasher"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
[[package]]
name = "smallvec"
version = "1.15.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
[[package]]
name = "stable_deref_trait"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "string_cache"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901"
dependencies = [
"new_debug_unreachable",
"parking_lot",
"phf_shared",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2",
"quote",
]
[[package]]
name = "syn"
version = "2.0.111"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "unicode-ident"
version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
[[package]]
name = "unicode-width"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "web_atoms"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acd0c322f146d0f8aad130ce6c187953889359584497dac6561204c8e17bb43d"
dependencies = [
"phf",
"phf_codegen",
"string_cache",
"string_cache_codegen",
]
[[package]]
name = "windows-link"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"

9
Cargo.toml Normal file
View File

@@ -0,0 +1,9 @@
[package]
name = "blog-extractor"
version = "0.1.0"
edition = "2024"
[dependencies]
scraper = "0.25.0"
serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.145"

90
README.md Normal file
View File

@@ -0,0 +1,90 @@
# Blog Extractor
A Rust tool that extracts blog posts from HTML files and exports them in multiple formats (JSON and plain text).
I just needed a simple script that would extract data from blogger.com
## Features
- **HTML Parsing**: Extracts blog post data (title, date, content) from HTML files using CSS selectors
- **JSON Export**: Saves each blog post as an individual JSON file for easy programmatic access
- **Combined Text Export**: Generates a single text file containing all posts in a human-readable format
- **Batch Processing**: Processes all HTML files in a directory in one run
- **Error Handling**: Gracefully handles processing errors without stopping the entire operation
## How It Works
The tool scans a specified directory for HTML files and extracts:
- **Title**: From the `h3.post-title` element
- **Date**: From the `time.published` element
- **Content**: From the `.post-body.entry-content` element
- **Source File**: The original filename for reference
Each post is saved in two formats:
1. Individual JSON files in `output/json/` with the same filename as the source HTML
2. A combined text file at `output/all_posts_combined.txt` containing all posts
## Installation
Make sure you have Rust installed. If not, visit [rustup.rs](https://rustup.rs/).
Clone the repository:
```bash
git clone https://git.gabrielkaszewski.dev/GKaszewski/blog-extractor.git
cd blog-extractor
```
## Usage
Run the tool with a directory path containing HTML files:
```bash
cargo run -- <directory_path>
```
Example:
```bash
cargo run -- ./blog_html_files
```
### Output
After running, you'll find:
- `output/json/` - Individual JSON files for each blog post
- `output/all_posts_combined.txt` - All posts combined in text format
## Dependencies
- **scraper** - HTML parsing and CSS selector support
- **serde** - Serialization framework
- **serde_json** - JSON serialization
## Requirements
The HTML files should contain:
- A title in an `<h3 class="post-title">` element
- A publish date in a `<time class="published">` element
- Post content in a `<div class="post-body entry-content">` element
## Project Structure
```
blog-extractor/
├── src/
│ └── main.rs # Main application logic
├── output/
│ ├── json/ # Individual JSON files
│ └── all_posts_combined.txt # Combined text file
├── Cargo.toml # Project configuration
└── README.md # This file
```
## License
This project is open source and available under the MIT License.

134
src/main.rs Normal file
View File

@@ -0,0 +1,134 @@
use scraper::{Html, Selector};
use serde::Serialize;
use std::env;
use std::fs::{self};
use std::io::Write;
use std::path::Path;
use std::process;
// We add 'Serialize' so this struct can be turned into JSON automatically
#[derive(Serialize)]
struct BlogPost {
title: String,
date: String,
content: String,
// Added source filename so you know where the data came from in the big file
source_file: String,
}
fn extract_content(file_path: &Path) -> Result<BlogPost, Box<dyn std::error::Error>> {
let html_content = fs::read_to_string(file_path)?;
let document = Html::parse_document(&html_content);
let title_selector = Selector::parse("h3.post-title").unwrap();
let date_selector = Selector::parse("time.published").unwrap();
let body_selector = Selector::parse(".post-body.entry-content").unwrap();
let title = document
.select(&title_selector)
.next()
.map(|el| el.text().collect::<Vec<_>>().join(""))
.unwrap_or("No Title Found".to_string())
.trim()
.to_string();
let date = document
.select(&date_selector)
.next()
.map(|el| el.text().collect::<Vec<_>>().join(""))
.unwrap_or("No Date Found".to_string())
.trim()
.to_string();
let content = document
.select(&body_selector)
.next()
.map(|el| {
el.text()
.map(|t| t.trim())
.filter(|t| !t.is_empty())
.collect::<Vec<_>>()
.join("\n")
})
.unwrap_or("No Content Found".to_string());
Ok(BlogPost {
title,
date,
content,
source_file: file_path.file_name().unwrap().to_string_lossy().to_string(),
})
}
fn main() {
let args: Vec<String> = env::args().collect();
if args.len() < 2 {
eprintln!("Usage: cargo run -- <directory_path>");
process::exit(1);
}
let dir_path = &args[1];
let path = Path::new(dir_path);
if !path.exists() || !path.is_dir() {
eprintln!("Error: '{}' is not a valid directory.", dir_path);
process::exit(1);
}
// --- SETUP OUTPUTS ---
let output_dir = Path::new("output");
let json_dir = output_dir.join("json");
// Create directories if they don't exist
fs::create_dir_all(&json_dir).expect("Could not create output directories");
// Create/Truncate the big text file (Option B)
let big_text_path = output_dir.join("all_posts_combined.txt");
let mut big_file = fs::File::create(&big_text_path).expect("Could not create big text file");
println!("Scanning directory: {}", dir_path);
println!("Output will be saved to: ./output/\n");
let entries = fs::read_dir(path).expect("Could not read directory");
for entry in entries {
if let Ok(entry) = entry {
let input_path = entry.path();
if input_path.is_file() && input_path.extension().map_or(false, |ext| ext == "html") {
match extract_content(&input_path) {
Ok(post) => {
println!("Processing: {}", post.title);
// --- OPTION A: Save to individual JSON file ---
// We change the extension from .html to .json
let file_stem = input_path.file_stem().unwrap();
let json_path = json_dir.join(file_stem).with_extension("json");
let json_file =
fs::File::create(&json_path).expect("Failed to create JSON file");
serde_json::to_writer_pretty(json_file, &post)
.expect("Failed to write JSON");
// --- OPTION B: Append to one big text file ---
// We use `writeln!` macro to write formatted text to the file stream
writeln!(
big_file,
"==================================================\n\
FILE: {}\n\
TITLE: {}\n\
DATE: {}\n\
--------------------------------------------------\n\
{}\n\n",
post.source_file, post.title, post.date, post.content
)
.expect("Failed to write to big text file");
}
Err(e) => eprintln!("Error processing {:?}: {}", input_path.file_name(), e),
}
}
}
}
println!("\nDone! Check the 'output' folder.");
}