Back in working state, using kuchiki as the HTML tree traverser.
This commit is contained in:
parent
0a5717b8fe
commit
54bca010e5
|
@ -5,5 +5,5 @@ authors = ["chmod777"]
|
|||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
scraper="0.12.0"
|
||||
|
||||
kuchiki = {version = "0.8.1"}
|
||||
html5ever = "0.25.1"
|
|
@ -6,22 +6,35 @@
|
|||
pub struct Config {
|
||||
pub empty_lines_before_h: usize,
|
||||
pub empty_lines_after_h: usize,
|
||||
pub empty_lines_before_h_str: String,
|
||||
pub empty_lines_after_h_str: String,
|
||||
|
||||
pub empty_lines_before_p: usize,
|
||||
pub empty_lines_after_p: usize,
|
||||
pub empty_lines_before_p_str: String,
|
||||
pub empty_lines_after_p_str: String,
|
||||
|
||||
pub empty_lines_before_a: usize,
|
||||
pub empty_lines_after_a: usize,
|
||||
pub empty_lines_before_a_str: String,
|
||||
pub empty_lines_after_a_str: String,
|
||||
|
||||
pub empty_lines_before_img: usize,
|
||||
pub empty_lines_between_img_alt: usize,
|
||||
pub empty_lines_after_img: usize,
|
||||
pub empty_lines_before_img_str: String,
|
||||
pub empty_lines_between_img_alt_str: String,
|
||||
pub empty_lines_after_img_str: String,
|
||||
|
||||
empty_lines_before_list: usize,
|
||||
empty_lines_between_list_items: usize,
|
||||
empty_lines_after_list: usize,
|
||||
pub empty_lines_before_list: usize,
|
||||
pub empty_lines_between_list_items: usize,
|
||||
pub empty_lines_after_list: usize,
|
||||
pub empty_lines_before_list_str: String,
|
||||
pub empty_lines_between_list_items_str: String,
|
||||
pub empty_lines_after_list_str: String,
|
||||
|
||||
pub empty_lines_br: usize,
|
||||
pub empty_lines_br_str: String,
|
||||
|
||||
pub convert_mydomain_links_to_gmi: Option<String>,
|
||||
}
|
||||
|
@ -30,22 +43,35 @@ impl Default for Config {
|
|||
Config {
|
||||
empty_lines_before_h: 0,
|
||||
empty_lines_after_h: 0,
|
||||
empty_lines_before_h_str: Config::empty_lines(0),
|
||||
empty_lines_after_h_str: Config::empty_lines(0),
|
||||
|
||||
empty_lines_before_p: 0,
|
||||
empty_lines_after_p: 0,
|
||||
empty_lines_before_p_str: Config::empty_lines(0),
|
||||
empty_lines_after_p_str: Config::empty_lines(0),
|
||||
|
||||
empty_lines_before_a: 0,
|
||||
empty_lines_after_a: 0,
|
||||
empty_lines_before_a_str: Config::empty_lines(0),
|
||||
empty_lines_after_a_str: Config::empty_lines(0),
|
||||
|
||||
empty_lines_before_img: 0,
|
||||
empty_lines_between_img_alt: 0,
|
||||
empty_lines_after_img: 0,
|
||||
empty_lines_before_img_str: Config::empty_lines(0),
|
||||
empty_lines_between_img_alt_str: Config::empty_lines(0),
|
||||
empty_lines_after_img_str: Config::empty_lines(0),
|
||||
|
||||
empty_lines_before_list: 0,
|
||||
empty_lines_between_list_items: 0,
|
||||
empty_lines_after_list: 0,
|
||||
empty_lines_before_list_str: Config::empty_lines(0),
|
||||
empty_lines_between_list_items_str: Config::empty_lines(0),
|
||||
empty_lines_after_list_str: Config::empty_lines(0),
|
||||
|
||||
empty_lines_br: 1,
|
||||
empty_lines_br_str: Config::empty_lines(1),
|
||||
|
||||
convert_mydomain_links_to_gmi: None,
|
||||
}
|
||||
|
@ -59,48 +85,4 @@ impl Config {
|
|||
(0..count).map(|_| "\n").collect()
|
||||
}
|
||||
}
|
||||
pub fn empty_lines_before_h_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_before_h)
|
||||
}
|
||||
pub fn empty_lines_after_h_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_after_h)
|
||||
}
|
||||
|
||||
pub fn empty_lines_before_p_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_before_p)
|
||||
}
|
||||
pub fn empty_lines_after_p_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_after_p)
|
||||
}
|
||||
|
||||
pub fn empty_lines_before_a_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_before_a)
|
||||
}
|
||||
pub fn empty_lines_after_a_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_after_a)
|
||||
}
|
||||
|
||||
pub fn empty_lines_before_img_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_before_img)
|
||||
}
|
||||
pub fn empty_lines_between_img_alt_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_between_img_alt)
|
||||
}
|
||||
pub fn empty_lines_after_img_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_after_img)
|
||||
}
|
||||
|
||||
pub fn empty_lines_before_list_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_before_list)
|
||||
}
|
||||
pub fn empty_lines_between_list_items_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_between_list_items)
|
||||
}
|
||||
pub fn empty_lines_after_list_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_after_list)
|
||||
}
|
||||
|
||||
pub fn empty_lines_br_str(&self) -> String {
|
||||
Config::empty_lines(self.empty_lines_br)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,159 @@
|
|||
// File: src/handlers.rs
|
||||
// Author: chmod777
|
||||
// Creation Date: 2021-3-3
|
||||
// License: AGPLv3
|
||||
|
||||
use super::State;
|
||||
|
||||
use html5ever::local_name;
|
||||
use kuchiki::{ElementData, NodeRef};
|
||||
|
||||
pub fn handle_header_node(
|
||||
state: &mut State,
|
||||
importance: usize,
|
||||
node: &NodeRef,
|
||||
) -> Result<&'static str, String> {
|
||||
if let Some(text_node) = node.first_child() {
|
||||
if let Some(header_text) = text_node.as_text() {
|
||||
return handle_header(state, importance, &header_text.borrow());
|
||||
} else {
|
||||
return Err(String::from("Expected text in header"));
|
||||
}
|
||||
}
|
||||
Err(String::from("Expected text node in header"))
|
||||
}
|
||||
pub fn handle_header(
|
||||
state: &mut State,
|
||||
importance: usize,
|
||||
header: &str,
|
||||
) -> Result<&'static str, String> {
|
||||
const HEADINGS: [&'static str; 6] = ["#", "##", "###", "####", "#####", "######"];
|
||||
if importance < 1 || importance > 6 {
|
||||
return Err(String::from("Headers must have an importance of 1-6"));
|
||||
}
|
||||
state.gemtext.push_str(&format!(
|
||||
"{}{} {}\n{}",
|
||||
state.config.empty_lines_before_h_str,
|
||||
HEADINGS[importance - 1],
|
||||
header,
|
||||
state.config.empty_lines_after_h_str
|
||||
));
|
||||
Ok("")
|
||||
}
|
||||
/// Converts a `<p>` element into a gemtext text line.
///
/// Only the first child of `node` is inspected, and it must be a text node.
/// The appended text is `config.empty_lines_before_p_str`, the paragraph
/// text, a newline, and `config.empty_lines_after_p_str`.
///
/// Returns `Ok("")` on success.
///
/// # Errors
/// * `"Expected text node in paragraph"` when `node` has no children.
/// * `"Expected text in text paragraph"` when the first child is not a text node.
pub fn handle_paragraph_node(state: &mut State, node: &NodeRef) -> Result<&'static str, String> {
    let text_node = node
        .first_child()
        .ok_or_else(|| String::from("Expected text node in paragraph"))?;
    let paragraph_text = text_node
        .as_text()
        .ok_or_else(|| String::from("Expected text in text paragraph"))?;

    // The argument below had been mangled into `¶graph_text` (the `&para`
    // prefix was decoded as an HTML entity); restored to a borrow of the text.
    state.gemtext.push_str(&format!(
        "{}{}\n{}",
        state.config.empty_lines_before_p_str,
        &paragraph_text.borrow(),
        state.config.empty_lines_after_p_str
    ));

    Ok("")
}
|
||||
pub fn handle_link_node(
|
||||
state: &mut State,
|
||||
element_data: &ElementData,
|
||||
node: &NodeRef,
|
||||
) -> Result<&'static str, String> {
|
||||
let link = match element_data.attributes.borrow().get(local_name!("href")) {
|
||||
Some(link) => {
|
||||
let mut link = link.to_owned();
|
||||
if let Some(ref mydomain) = state.config.convert_mydomain_links_to_gmi {
|
||||
if link.ends_with(".html") && link.contains(mydomain.as_str()) {
|
||||
link.truncate(link.len() - 4);
|
||||
link.push_str("gmi");
|
||||
}
|
||||
}
|
||||
link
|
||||
}
|
||||
None => return Err(String::from("Expected link in href on <a> element")),
|
||||
};
|
||||
let link_text = if let Some(text_node) = node.first_child() {
|
||||
if let Some(link_text) = text_node.as_text() {
|
||||
link_text.borrow().clone()
|
||||
} else {
|
||||
String::new()
|
||||
}
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
state.gemtext.push_str(&format!(
|
||||
"{}=> {}",
|
||||
state.config.empty_lines_before_a_str, link
|
||||
));
|
||||
if !link_text.is_empty() {
|
||||
state.gemtext.push_str(&format!(" {}", link_text));
|
||||
}
|
||||
state
|
||||
.gemtext
|
||||
.push_str(&format!("\n{}", state.config.empty_lines_after_a_str));
|
||||
Ok("")
|
||||
}
|
||||
pub fn handle_image_node(
|
||||
state: &mut State,
|
||||
element_data: &ElementData,
|
||||
) -> Result<&'static str, String> {
|
||||
let attributes = element_data.attributes.borrow();
|
||||
match attributes.get(local_name!("src")) {
|
||||
Some(src) => {
|
||||
state
|
||||
.gemtext
|
||||
.push_str(&state.config.empty_lines_before_img_str);
|
||||
state.gemtext.push_str("=> ");
|
||||
state.gemtext.push_str(src);
|
||||
}
|
||||
None => return Err(String::from("Expected <img> to have src attribute")),
|
||||
};
|
||||
|
||||
if let Some(title) = attributes.get(local_name!("title")) {
|
||||
state.gemtext.push(' ');
|
||||
state.gemtext.push_str(title);
|
||||
}
|
||||
state.gemtext.push('\n');
|
||||
|
||||
if let Some(alt) = attributes.get(local_name!("alt")) {
|
||||
state
|
||||
.gemtext
|
||||
.push_str(&state.config.empty_lines_between_img_alt_str);
|
||||
state.gemtext.push_str(alt);
|
||||
state.gemtext.push('\n');
|
||||
}
|
||||
|
||||
state
|
||||
.gemtext
|
||||
.push_str(&state.config.empty_lines_after_img_str);
|
||||
|
||||
Ok("")
|
||||
}
|
||||
pub fn handle_list_node(state: &mut State, node: &NodeRef) -> Result<&'static str, String> {
|
||||
if let Some(child) = node.first_child() {
|
||||
if let Some(text) = child.as_text() {
|
||||
state.gemtext.push_str(&format!("* {}\n", text.borrow()));
|
||||
} else if let Some(element_data) = child.as_element() {
|
||||
return match element_data.name.local {
|
||||
local_name!("a") => {
|
||||
let _res = handle_link_node(state, element_data, &child)?;
|
||||
Ok("Recursion Handled")
|
||||
}
|
||||
local_name!("img") => {
|
||||
let _res = handle_image_node(state, element_data)?;
|
||||
Ok("Recursion Handled")
|
||||
}
|
||||
_ => Err(String::from("Unsupported nested element in li element")),
|
||||
};
|
||||
}
|
||||
}
|
||||
Ok("")
|
||||
}
|
||||
/// Handles a `<br>` element by appending `config.empty_lines_br_str` to the
/// output. The node itself is not inspected. Always returns `Ok("")`.
pub fn handle_break_node(state: &mut State, _node: &NodeRef) -> Result<&'static str, String> {
    let line_break = &state.config.empty_lines_br_str;
    state.gemtext.push_str(line_break);
    Ok("")
}
|
174
src/lib.rs
174
src/lib.rs
|
@ -3,130 +3,74 @@
|
|||
// Creation Date: 2021-2-23
|
||||
// License: AGPLv3
|
||||
|
||||
extern crate scraper;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub mod config;
|
||||
mod handlers;
|
||||
|
||||
use scraper::{Html, Selector};
|
||||
use html5ever::local_name;
|
||||
use kuchiki::traits::*;
|
||||
use kuchiki::{NodeData, NodeRef};
|
||||
|
||||
use config::Config;
|
||||
use handlers::*;
|
||||
|
||||
pub struct State {
|
||||
pub gemtext: String,
|
||||
pub config: Config,
|
||||
}
|
||||
impl Default for State {
|
||||
fn default() -> Self {
|
||||
State {
|
||||
gemtext: String::new(),
|
||||
config: Config::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_node_recurisive(state: &mut State, current: &NodeRef) {
|
||||
if let Ok("Recursion Handled") = handle_node(state, current) {
|
||||
} else {
|
||||
for ref node in current.children() {
|
||||
handle_node_recurisive(state, node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_node(state: &mut State, node: &NodeRef) -> Result<&'static str, String> {
|
||||
match node.data() {
|
||||
NodeData::Element(element_data) => match element_data.name.local {
|
||||
local_name!("h1") => handle_header_node(state, 1, node),
|
||||
local_name!("h2") => handle_header_node(state, 2, node),
|
||||
local_name!("h3") => handle_header_node(state, 3, node),
|
||||
local_name!("h4") => handle_header_node(state, 4, node),
|
||||
local_name!("h5") => handle_header_node(state, 5, node),
|
||||
local_name!("h6") => handle_header_node(state, 6, node),
|
||||
local_name!("p") => handle_paragraph_node(state, node),
|
||||
local_name!("a") => handle_link_node(state, element_data, node),
|
||||
local_name!("img") => handle_image_node(state, element_data),
|
||||
local_name!("li") => handle_list_node(state, node),
|
||||
local_name!("br") => handle_break_node(state, node),
|
||||
_ => Ok(""),
|
||||
},
|
||||
_ => Ok(""),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn convert(html_src: &str) -> String {
|
||||
let document = Html::parse_document(html_src);
|
||||
let selector = Selector::parse("a, br, h1, h2, h3, h4, h5, h6, img, li, p").unwrap();
|
||||
let document = kuchiki::parse_html().one(html_src);
|
||||
|
||||
let config = config::Config::default();
|
||||
let empty_lines_before_h = config.empty_lines_before_h_str();
|
||||
let empty_lines_after_h = config.empty_lines_after_h_str();
|
||||
let empty_lines_before_p = config.empty_lines_before_p_str();
|
||||
let empty_lines_after_p = config.empty_lines_after_p_str();
|
||||
let empty_lines_before_a = config.empty_lines_before_a_str();
|
||||
let empty_lines_after_a = config.empty_lines_after_a_str();
|
||||
let empty_lines_before_img = config.empty_lines_before_img_str();
|
||||
let empty_lines_between_img_alt = config.empty_lines_between_img_alt_str();
|
||||
let empty_lines_after_img = config.empty_lines_after_img_str();
|
||||
let empty_lines_before_list = config.empty_lines_before_list_str();
|
||||
let empty_lines_between_list_items = config.empty_lines_between_list_items_str();
|
||||
let empty_lines_after_list = config.empty_lines_after_list_str();
|
||||
let br_lines = config.empty_lines_br_str();
|
||||
let convert_mydomain = Some(String::from(".html"));
|
||||
let body_selector = "body";
|
||||
let body = document.select(body_selector).unwrap();
|
||||
|
||||
let mut in_list = false;
|
||||
let mut first_list_item = false;
|
||||
let mut state = State::default();
|
||||
state.config.convert_mydomain_links_to_gmi = Some(String::from("mydomain"));
|
||||
|
||||
let mut gmi = String::new();
|
||||
for node in document.select(&selector) {
|
||||
if !in_list && node.value().name() == "li" {
|
||||
gmi.push_str(&empty_lines_before_list);
|
||||
in_list = true;
|
||||
first_list_item = true;
|
||||
} else if in_list && node.value().name() != "li" {
|
||||
gmi.push_str(&empty_lines_after_list);
|
||||
in_list = false;
|
||||
}
|
||||
|
||||
let html = node.inner_html();
|
||||
match node.value().name() {
|
||||
"h1" => gmi.push_str(&format!(
|
||||
"{}# {}{}",
|
||||
empty_lines_before_h, html, empty_lines_after_h
|
||||
)),
|
||||
"h2" => gmi.push_str(&format!(
|
||||
"{}## {}{}",
|
||||
empty_lines_before_h, html, empty_lines_after_h
|
||||
)),
|
||||
"h3" => gmi.push_str(&format!(
|
||||
"{}### {}{}",
|
||||
empty_lines_before_h, html, empty_lines_after_h
|
||||
)),
|
||||
"h4" => gmi.push_str(&format!(
|
||||
"{}#### {}{}",
|
||||
empty_lines_before_h, html, empty_lines_after_h
|
||||
)),
|
||||
"h5" => gmi.push_str(&format!(
|
||||
"{}##### {}{}",
|
||||
empty_lines_before_h, html, empty_lines_after_h
|
||||
)),
|
||||
"h6" => gmi.push_str(&format!(
|
||||
"{}###### {}{}",
|
||||
empty_lines_before_h, html, empty_lines_after_h
|
||||
)),
|
||||
"p" => gmi.push_str(&format!(
|
||||
"{}{}{}",
|
||||
empty_lines_before_p, html, empty_lines_after_p
|
||||
)),
|
||||
"a" => {
|
||||
let link = node.value().attr("href").unwrap_or("");
|
||||
if link.is_empty() {
|
||||
gmi.push_str(&format!(
|
||||
"{}=> {}{}",
|
||||
empty_lines_before_a, html, empty_lines_after_a
|
||||
));
|
||||
} else {
|
||||
let mut link = link.to_owned();
|
||||
if let Some(ref mydomain) = convert_mydomain {
|
||||
if link.ends_with(".html") && link.contains(mydomain.as_str()) {
|
||||
link.truncate(link.len() - 4);
|
||||
link.push_str("gmi");
|
||||
}
|
||||
}
|
||||
gmi.push_str(&format!(
|
||||
"{}=> {} {}{}",
|
||||
empty_lines_before_a, link, html, empty_lines_after_a
|
||||
));
|
||||
}
|
||||
}
|
||||
"img" => {
|
||||
let link = node.value().attr("src").unwrap_or("");
|
||||
let title = node.value().attr("title").unwrap_or("");
|
||||
let alt = node.value().attr("alt").unwrap_or("");
|
||||
gmi.push_str(&empty_lines_before_img);
|
||||
if title.is_empty() {
|
||||
gmi.push_str(&format!("=> {}", link));
|
||||
} else {
|
||||
gmi.push_str(&format!("=> {} {}", link, title));
|
||||
}
|
||||
if !alt.is_empty() {
|
||||
gmi.push_str(&format!("\n{}{}", empty_lines_between_img_alt, alt));
|
||||
}
|
||||
gmi.push_str(&empty_lines_after_img);
|
||||
}
|
||||
"li" => {
|
||||
println!("li");
|
||||
if !first_list_item {
|
||||
gmi.push_str(&empty_lines_between_list_items);
|
||||
}
|
||||
gmi.push_str(&format!("* {}", html));
|
||||
first_list_item = false;
|
||||
}
|
||||
"br" => {
|
||||
gmi.push_str(&format!("{}", br_lines));
|
||||
continue;
|
||||
}
|
||||
tag => panic!("matched an unsupported tag: {}", tag),
|
||||
}
|
||||
gmi.push('\n');
|
||||
for node_data_ref in body {
|
||||
let current = node_data_ref.as_node();
|
||||
handle_node_recurisive(&mut state, current);
|
||||
}
|
||||
gmi
|
||||
|
||||
state.gemtext
|
||||
}
|
||||
|
|
|
@ -3,6 +3,9 @@
|
|||
// Creation Date: 2021-2-26
|
||||
// License: AGPLv3
|
||||
|
||||
#![macro_use]
|
||||
extern crate html5ever;
|
||||
|
||||
use html2gemini_rs::*;
|
||||
use std::fs::File;
|
||||
use std::{env, io::Read};
|
||||
|
|
Loading…
Reference in New Issue