Add initial project structure with README, Cargo configuration, and main logic
- Create .gitignore to exclude target and output directories - Initialize Cargo.toml with project metadata and dependencies - Add README.md with project description, features, installation, and usage instructions - Implement main.rs for extracting blog posts from HTML files and exporting to JSON and text formats
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
/target
|
||||
/output
|
||||
509
Cargo.lock
generated
Normal file
509
Cargo.lock
generated
Normal file
@@ -0,0 +1,509 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3"
|
||||
|
||||
[[package]]
|
||||
name = "blog-extractor"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.36.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2"
|
||||
dependencies = [
|
||||
"cssparser-macros",
|
||||
"dtoa-short",
|
||||
"itoa",
|
||||
"phf",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser-macros"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10b768e943bed7bf2cab53df09f4bc34bfd217cdb57d971e769874c9a6710618"
|
||||
dependencies = [
|
||||
"derive_more-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more-impl"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d286bfdaf75e988b4a78e013ecd79c581e06399ab53fbacd2d916c2f904f30b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustc_version",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtoa"
|
||||
version = "1.0.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6add3b8cff394282be81f3fc1a0605db594ed69890078ca6e2cab1c408bcf04"
|
||||
|
||||
[[package]]
|
||||
name = "dtoa-short"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
|
||||
dependencies = [
|
||||
"dtoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ego-tree"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8"
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
||||
|
||||
[[package]]
|
||||
name = "futf"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
|
||||
dependencies = [
|
||||
"mac",
|
||||
"new_debug_unreachable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getopts"
|
||||
version = "0.2.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.36.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6452c4751a24e1b99c3260d505eaeee76a050573e61f30ac2c924ddc7236f01e"
|
||||
dependencies = [
|
||||
"log",
|
||||
"markup5ever",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.178"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
|
||||
dependencies = [
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
|
||||
|
||||
[[package]]
|
||||
name = "mac"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.36.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c3294c4d74d0742910f8c7b466f44dda9eb2d5742c1e430138df290a1e8451c"
|
||||
dependencies = [
|
||||
"log",
|
||||
"tendril",
|
||||
"web_atoms",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273"
|
||||
|
||||
[[package]]
|
||||
name = "new_debug_unreachable"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
"parking_lot_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.9.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"smallvec",
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf"
|
||||
dependencies = [
|
||||
"phf_macros",
|
||||
"phf_shared",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"phf_shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_macros"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_shared",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "precomputed-hash"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.103"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5ee95bc4ef87b8d5ba32e8b7714ccc834865276eab0aed5c9958d00ec45f49e8"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.42"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.5.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
|
||||
dependencies = [
|
||||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "scraper"
|
||||
version = "0.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93cecd86d6259499c844440546d02f55f3e17bd286e529e48d1f9f67e92315cb"
|
||||
dependencies = [
|
||||
"cssparser",
|
||||
"ego-tree",
|
||||
"getopts",
|
||||
"html5ever",
|
||||
"precomputed-hash",
|
||||
"selectors",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "selectors"
|
||||
version = "0.33.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "feef350c36147532e1b79ea5c1f3791373e61cbd9a6a2615413b3807bb164fb7"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cssparser",
|
||||
"derive_more",
|
||||
"log",
|
||||
"new_debug_unreachable",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"precomputed-hash",
|
||||
"rustc-hash",
|
||||
"servo_arc",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_core"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.228"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.145"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
"ryu",
|
||||
"serde",
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "servo_arc"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "170fb83ab34de17dc69aa7c67482b22218ddb85da56546f9bd6b929e32a05930"
|
||||
dependencies = [
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.15.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03"
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
|
||||
|
||||
[[package]]
|
||||
name = "string_cache"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901"
|
||||
dependencies = [
|
||||
"new_debug_unreachable",
|
||||
"parking_lot",
|
||||
"phf_shared",
|
||||
"precomputed-hash",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "string_cache_codegen"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69"
|
||||
dependencies = [
|
||||
"phf_generator",
|
||||
"phf_shared",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.111"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
|
||||
dependencies = [
|
||||
"futf",
|
||||
"mac",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
|
||||
|
||||
[[package]]
|
||||
name = "utf-8"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "web_atoms"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acd0c322f146d0f8aad130ce6c187953889359584497dac6561204c8e17bb43d"
|
||||
dependencies = [
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"string_cache",
|
||||
"string_cache_codegen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
||||
9
Cargo.toml
Normal file
9
Cargo.toml
Normal file
@@ -0,0 +1,9 @@
|
||||
[package]
|
||||
name = "blog-extractor"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
scraper = "0.25.0"
|
||||
serde = { version = "1.0.228", features = ["derive"] }
|
||||
serde_json = "1.0.145"
|
||||
90
README.md
Normal file
90
README.md
Normal file
@@ -0,0 +1,90 @@
|
||||
# Blog Extractor
|
||||
|
||||
A Rust tool that extracts blog posts from HTML files and exports them in multiple formats (JSON and plain text).
|
||||
|
||||
I just needed a simple script that would extract data from blogger.com
|
||||
|
||||
## Features
|
||||
|
||||
- **HTML Parsing**: Extracts blog post data (title, date, content) from HTML files using CSS selectors
|
||||
- **JSON Export**: Saves each blog post as an individual JSON file for easy programmatic access
|
||||
- **Combined Text Export**: Generates a single text file containing all posts in a human-readable format
|
||||
- **Batch Processing**: Processes all HTML files in a directory in one run
|
||||
- **Error Handling**: Gracefully handles processing errors without stopping the entire operation
|
||||
|
||||
## How It Works
|
||||
|
||||
The tool scans a specified directory for HTML files and extracts:
|
||||
|
||||
- **Title**: From the `h3.post-title` element
|
||||
- **Date**: From the `time.published` element
|
||||
- **Content**: From the `.post-body.entry-content` element
|
||||
- **Source File**: The original filename for reference
|
||||
|
||||
Each post is saved in two formats:
|
||||
|
||||
1. Individual JSON files in `output/json/` with the same filename as the source HTML
|
||||
2. A combined text file at `output/all_posts_combined.txt` containing all posts
|
||||
|
||||
## Installation
|
||||
|
||||
Make sure you have Rust installed. If not, visit [rustup.rs](https://rustup.rs/).
|
||||
|
||||
Clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://git.gabrielkaszewski.dev/GKaszewski/blog-extractor.git
|
||||
cd blog-extractor
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
Run the tool with a directory path containing HTML files:
|
||||
|
||||
```bash
|
||||
cargo run -- <directory_path>
|
||||
```
|
||||
|
||||
Example:
|
||||
|
||||
```bash
|
||||
cargo run -- ./blog_html_files
|
||||
```
|
||||
|
||||
### Output
|
||||
|
||||
After running, you'll find:
|
||||
|
||||
- `output/json/` - Individual JSON files for each blog post
|
||||
- `output/all_posts_combined.txt` - All posts combined in text format
|
||||
|
||||
## Dependencies
|
||||
|
||||
- **scraper** - HTML parsing and CSS selector support
|
||||
- **serde** - Serialization framework
|
||||
- **serde_json** - JSON serialization
|
||||
|
||||
## Requirements
|
||||
|
||||
The HTML files should contain:
|
||||
|
||||
- A title in an `<h3 class="post-title">` element
|
||||
- A publish date in a `<time class="published">` element
|
||||
- Post content in a `<div class="post-body entry-content">` element
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
blog-extractor/
|
||||
├── src/
|
||||
│ └── main.rs # Main application logic
|
||||
├── output/
|
||||
│ ├── json/ # Individual JSON files
|
||||
│ └── all_posts_combined.txt # Combined text file
|
||||
├── Cargo.toml # Project configuration
|
||||
└── README.md # This file
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
This project is open source and available under the MIT License.
|
||||
134
src/main.rs
Normal file
134
src/main.rs
Normal file
@@ -0,0 +1,134 @@
|
||||
use scraper::{Html, Selector};
|
||||
use serde::Serialize;
|
||||
use std::env;
|
||||
use std::fs::{self};
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::process;
|
||||
|
||||
// We add 'Serialize' so this struct can be turned into JSON automatically
|
||||
#[derive(Serialize)]
|
||||
struct BlogPost {
|
||||
title: String,
|
||||
date: String,
|
||||
content: String,
|
||||
// Added source filename so you know where the data came from in the big file
|
||||
source_file: String,
|
||||
}
|
||||
|
||||
fn extract_content(file_path: &Path) -> Result<BlogPost, Box<dyn std::error::Error>> {
|
||||
let html_content = fs::read_to_string(file_path)?;
|
||||
let document = Html::parse_document(&html_content);
|
||||
|
||||
let title_selector = Selector::parse("h3.post-title").unwrap();
|
||||
let date_selector = Selector::parse("time.published").unwrap();
|
||||
let body_selector = Selector::parse(".post-body.entry-content").unwrap();
|
||||
|
||||
let title = document
|
||||
.select(&title_selector)
|
||||
.next()
|
||||
.map(|el| el.text().collect::<Vec<_>>().join(""))
|
||||
.unwrap_or("No Title Found".to_string())
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
let date = document
|
||||
.select(&date_selector)
|
||||
.next()
|
||||
.map(|el| el.text().collect::<Vec<_>>().join(""))
|
||||
.unwrap_or("No Date Found".to_string())
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
let content = document
|
||||
.select(&body_selector)
|
||||
.next()
|
||||
.map(|el| {
|
||||
el.text()
|
||||
.map(|t| t.trim())
|
||||
.filter(|t| !t.is_empty())
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n")
|
||||
})
|
||||
.unwrap_or("No Content Found".to_string());
|
||||
|
||||
Ok(BlogPost {
|
||||
title,
|
||||
date,
|
||||
content,
|
||||
source_file: file_path.file_name().unwrap().to_string_lossy().to_string(),
|
||||
})
|
||||
}
|
||||
|
||||
fn main() {
|
||||
let args: Vec<String> = env::args().collect();
|
||||
|
||||
if args.len() < 2 {
|
||||
eprintln!("Usage: cargo run -- <directory_path>");
|
||||
process::exit(1);
|
||||
}
|
||||
|
||||
let dir_path = &args[1];
|
||||
let path = Path::new(dir_path);
|
||||
|
||||
if !path.exists() || !path.is_dir() {
|
||||
eprintln!("Error: '{}' is not a valid directory.", dir_path);
|
||||
process::exit(1);
|
||||
}
|
||||
|
||||
// --- SETUP OUTPUTS ---
|
||||
let output_dir = Path::new("output");
|
||||
let json_dir = output_dir.join("json");
|
||||
|
||||
// Create directories if they don't exist
|
||||
fs::create_dir_all(&json_dir).expect("Could not create output directories");
|
||||
|
||||
// Create/Truncate the big text file (Option B)
|
||||
let big_text_path = output_dir.join("all_posts_combined.txt");
|
||||
let mut big_file = fs::File::create(&big_text_path).expect("Could not create big text file");
|
||||
|
||||
println!("Scanning directory: {}", dir_path);
|
||||
println!("Output will be saved to: ./output/\n");
|
||||
|
||||
let entries = fs::read_dir(path).expect("Could not read directory");
|
||||
|
||||
for entry in entries {
|
||||
if let Ok(entry) = entry {
|
||||
let input_path = entry.path();
|
||||
|
||||
if input_path.is_file() && input_path.extension().map_or(false, |ext| ext == "html") {
|
||||
match extract_content(&input_path) {
|
||||
Ok(post) => {
|
||||
println!("Processing: {}", post.title);
|
||||
|
||||
// --- OPTION A: Save to individual JSON file ---
|
||||
// We change the extension from .html to .json
|
||||
let file_stem = input_path.file_stem().unwrap();
|
||||
let json_path = json_dir.join(file_stem).with_extension("json");
|
||||
|
||||
let json_file =
|
||||
fs::File::create(&json_path).expect("Failed to create JSON file");
|
||||
serde_json::to_writer_pretty(json_file, &post)
|
||||
.expect("Failed to write JSON");
|
||||
|
||||
// --- OPTION B: Append to one big text file ---
|
||||
// We use `writeln!` macro to write formatted text to the file stream
|
||||
writeln!(
|
||||
big_file,
|
||||
"==================================================\n\
|
||||
FILE: {}\n\
|
||||
TITLE: {}\n\
|
||||
DATE: {}\n\
|
||||
--------------------------------------------------\n\
|
||||
{}\n\n",
|
||||
post.source_file, post.title, post.date, post.content
|
||||
)
|
||||
.expect("Failed to write to big text file");
|
||||
}
|
||||
Err(e) => eprintln!("Error processing {:?}: {}", input_path.file_name(), e),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
println!("\nDone! Check the 'output' folder.");
|
||||
}
|
||||
Reference in New Issue
Block a user