Back in working state using kuchiki as the HTML tree traverser.

This commit is contained in:
jacob 2021-03-05 02:51:48 -09:00
parent 0a5717b8fe
commit 54bca010e5
5 changed files with 252 additions and 164 deletions

View File

@ -5,5 +5,5 @@ authors = ["chmod777"]
edition = "2018"
[dependencies]
scraper="0.12.0"
kuchiki = {version = "0.8.1"}
html5ever = "0.25.1"

View File

@ -6,22 +6,35 @@
/// Output-spacing and link-rewriting options used while building Gemtext.
///
/// Every `empty_lines_*` count has a companion `*_str` field. `Config::default`
/// fills each companion with `Config::empty_lines(count)` for the matching
/// count, i.e. a string made of that many `"\n"` characters, so handlers can
/// append the spacing without rebuilding it.
pub struct Config {
// Spacing around headings (`h1`..`h6`).
pub empty_lines_before_h: usize,
pub empty_lines_after_h: usize,
pub empty_lines_before_h_str: String,
pub empty_lines_after_h_str: String,
// Spacing around paragraphs (`p`).
pub empty_lines_before_p: usize,
pub empty_lines_after_p: usize,
pub empty_lines_before_p_str: String,
pub empty_lines_after_p_str: String,
// Spacing around links (`a`).
pub empty_lines_before_a: usize,
pub empty_lines_after_a: usize,
pub empty_lines_before_a_str: String,
pub empty_lines_after_a_str: String,
// Spacing around images (`img`); the "between" value separates the link
// line from the alt-text line.
pub empty_lines_before_img: usize,
pub empty_lines_between_img_alt: usize,
pub empty_lines_after_img: usize,
pub empty_lines_before_img_str: String,
pub empty_lines_between_img_alt_str: String,
pub empty_lines_after_img_str: String,
// NOTE(review): the three non-`pub` list fields below are declared again as
// `pub` immediately afterwards. This looks like the pre-change lines of the
// diff left in this view; confirm only one set exists in the real file.
empty_lines_before_list: usize,
empty_lines_between_list_items: usize,
empty_lines_after_list: usize,
// Spacing around and inside lists (`li`).
pub empty_lines_before_list: usize,
pub empty_lines_between_list_items: usize,
pub empty_lines_after_list: usize,
pub empty_lines_before_list_str: String,
pub empty_lines_between_list_items_str: String,
pub empty_lines_after_list_str: String,
// Newlines emitted for a `br` element; the default configuration uses 1.
pub empty_lines_br: usize,
pub empty_lines_br_str: String,
// When `Some(domain)`, the link handler rewrites an `href` that ends in
// ".html" and contains `domain` so that it ends in ".gmi" instead.
pub convert_mydomain_links_to_gmi: Option<String>,
}
@ -30,22 +43,35 @@ impl Default for Config {
Config {
empty_lines_before_h: 0,
empty_lines_after_h: 0,
empty_lines_before_h_str: Config::empty_lines(0),
empty_lines_after_h_str: Config::empty_lines(0),
empty_lines_before_p: 0,
empty_lines_after_p: 0,
empty_lines_before_p_str: Config::empty_lines(0),
empty_lines_after_p_str: Config::empty_lines(0),
empty_lines_before_a: 0,
empty_lines_after_a: 0,
empty_lines_before_a_str: Config::empty_lines(0),
empty_lines_after_a_str: Config::empty_lines(0),
empty_lines_before_img: 0,
empty_lines_between_img_alt: 0,
empty_lines_after_img: 0,
empty_lines_before_img_str: Config::empty_lines(0),
empty_lines_between_img_alt_str: Config::empty_lines(0),
empty_lines_after_img_str: Config::empty_lines(0),
empty_lines_before_list: 0,
empty_lines_between_list_items: 0,
empty_lines_after_list: 0,
empty_lines_before_list_str: Config::empty_lines(0),
empty_lines_between_list_items_str: Config::empty_lines(0),
empty_lines_after_list_str: Config::empty_lines(0),
empty_lines_br: 1,
empty_lines_br_str: Config::empty_lines(1),
convert_mydomain_links_to_gmi: None,
}
@ -59,48 +85,4 @@ impl Config {
(0..count).map(|_| "\n").collect()
}
}
pub fn empty_lines_before_h_str(&self) -> String {
Config::empty_lines(self.empty_lines_before_h)
}
pub fn empty_lines_after_h_str(&self) -> String {
Config::empty_lines(self.empty_lines_after_h)
}
pub fn empty_lines_before_p_str(&self) -> String {
Config::empty_lines(self.empty_lines_before_p)
}
pub fn empty_lines_after_p_str(&self) -> String {
Config::empty_lines(self.empty_lines_after_p)
}
pub fn empty_lines_before_a_str(&self) -> String {
Config::empty_lines(self.empty_lines_before_a)
}
pub fn empty_lines_after_a_str(&self) -> String {
Config::empty_lines(self.empty_lines_after_a)
}
pub fn empty_lines_before_img_str(&self) -> String {
Config::empty_lines(self.empty_lines_before_img)
}
pub fn empty_lines_between_img_alt_str(&self) -> String {
Config::empty_lines(self.empty_lines_between_img_alt)
}
pub fn empty_lines_after_img_str(&self) -> String {
Config::empty_lines(self.empty_lines_after_img)
}
pub fn empty_lines_before_list_str(&self) -> String {
Config::empty_lines(self.empty_lines_before_list)
}
pub fn empty_lines_between_list_items_str(&self) -> String {
Config::empty_lines(self.empty_lines_between_list_items)
}
pub fn empty_lines_after_list_str(&self) -> String {
Config::empty_lines(self.empty_lines_after_list)
}
pub fn empty_lines_br_str(&self) -> String {
Config::empty_lines(self.empty_lines_br)
}
}

159
src/handlers.rs Normal file
View File

@ -0,0 +1,159 @@
// File: src/handlers.rs
// Author: chmod777
// Creation Date: 2021-3-3
// License: AGPLv3
use super::State;
use html5ever::local_name;
use kuchiki::{ElementData, NodeRef};
/// Renders a heading element whose first child is a text node.
///
/// The text of the first child is forwarded to [`handle_header`] together with
/// `importance`; that call's result is returned unchanged.
///
/// # Errors
/// * `"Expected text node in header"` when `node` has no children.
/// * `"Expected text in header"` when the first child is not a text node.
/// * Any error produced by [`handle_header`].
pub fn handle_header_node(
    state: &mut State,
    importance: usize,
    node: &NodeRef,
) -> Result<&'static str, String> {
    let first = node
        .first_child()
        .ok_or_else(|| String::from("Expected text node in header"))?;
    let contents = first
        .as_text()
        .ok_or_else(|| String::from("Expected text in header"))?;
    // Keep the `Ref` in a named local so it is released before `first` goes away.
    let heading = contents.borrow();
    handle_header(state, importance, &heading)
}
/// Appends one heading line to `state.gemtext`.
///
/// The emitted text is, in order: the configured blank lines before a heading,
/// `importance` `#` characters, a space, `header`, a newline, and the
/// configured blank lines after a heading.
///
/// Returns `Ok("")` on success.
///
/// # Errors
/// Returns `"Headers must have an importance of 1-6"` (and writes nothing)
/// when `importance` is outside `1..=6`.
///
/// NOTE(review): the Gemini `text/gemini` specification defines only three
/// heading levels; confirm that emitting `####`..`######` is intended.
pub fn handle_header(
    state: &mut State,
    importance: usize,
    header: &str,
) -> Result<&'static str, String> {
    let marker = match importance {
        1 => "#",
        2 => "##",
        3 => "###",
        4 => "####",
        5 => "#####",
        6 => "######",
        _ => return Err(String::from("Headers must have an importance of 1-6")),
    };

    let out = &mut state.gemtext;
    out.push_str(&state.config.empty_lines_before_h_str);
    out.push_str(marker);
    out.push(' ');
    out.push_str(header);
    out.push('\n');
    out.push_str(&state.config.empty_lines_after_h_str);
    Ok("")
}
/// Renders a paragraph element whose first child is a text node.
///
/// Appends to `state.gemtext`: the configured blank lines before a paragraph,
/// the text of the first child, a newline, and the configured blank lines
/// after a paragraph. Only the first child is read here.
///
/// Returns `Ok("")` on success.
///
/// # Errors
/// * `"Expected text node in paragraph"` when `node` has no children.
/// * `"Expected text in text paragraph"` when the first child is not text.
pub fn handle_paragraph_node(state: &mut State, node: &NodeRef) -> Result<&'static str, String> {
    let first = node
        .first_child()
        .ok_or_else(|| String::from("Expected text node in paragraph"))?;
    let contents = first
        .as_text()
        .ok_or_else(|| String::from("Expected text in text paragraph"))?;
    let body = contents.borrow();

    let out = &mut state.gemtext;
    out.push_str(&state.config.empty_lines_before_p_str);
    out.push_str(&body);
    out.push('\n');
    out.push_str(&state.config.empty_lines_after_p_str);
    Ok("")
}
pub fn handle_link_node(
state: &mut State,
element_data: &ElementData,
node: &NodeRef,
) -> Result<&'static str, String> {
let link = match element_data.attributes.borrow().get(local_name!("href")) {
Some(link) => {
let mut link = link.to_owned();
if let Some(ref mydomain) = state.config.convert_mydomain_links_to_gmi {
if link.ends_with(".html") && link.contains(mydomain.as_str()) {
link.truncate(link.len() - 4);
link.push_str("gmi");
}
}
link
}
None => return Err(String::from("Expected link in href on <a> element")),
};
let link_text = if let Some(text_node) = node.first_child() {
if let Some(link_text) = text_node.as_text() {
link_text.borrow().clone()
} else {
String::new()
}
} else {
String::new()
};
state.gemtext.push_str(&format!(
"{}=> {}",
state.config.empty_lines_before_a_str, link
));
if !link_text.is_empty() {
state.gemtext.push_str(&format!(" {}", link_text));
}
state
.gemtext
.push_str(&format!("\n{}", state.config.empty_lines_after_a_str));
Ok("")
}
/// Renders an `<img>` element as a Gemtext link line, optionally followed by
/// a line holding its alt text.
///
/// Appends to `state.gemtext`, in order:
/// 1. the configured blank lines before an image,
/// 2. `=> ` followed by the `src` attribute,
/// 3. a space and the `title` attribute, if present,
/// 4. a newline,
/// 5. if `alt` is present: the configured image/alt spacing, the alt text and
///    a newline,
/// 6. the configured blank lines after an image.
///
/// Returns `Ok("")` on success.
///
/// # Errors
/// Returns `"Expected <img> to have src attribute"` (and writes nothing) when
/// the element has no `src` attribute.
pub fn handle_image_node(
    state: &mut State,
    element_data: &ElementData,
) -> Result<&'static str, String> {
    let attrs = element_data.attributes.borrow();
    let source = attrs
        .get(local_name!("src"))
        .ok_or_else(|| String::from("Expected <img> to have src attribute"))?;

    // Borrow the two disjoint halves of the state once, up front.
    let config = &state.config;
    let out = &mut state.gemtext;

    out.push_str(&config.empty_lines_before_img_str);
    out.push_str("=> ");
    out.push_str(source);
    if let Some(caption) = attrs.get(local_name!("title")) {
        out.push(' ');
        out.push_str(caption);
    }
    out.push('\n');

    if let Some(description) = attrs.get(local_name!("alt")) {
        out.push_str(&config.empty_lines_between_img_alt_str);
        out.push_str(description);
        out.push('\n');
    }

    out.push_str(&config.empty_lines_after_img_str);
    Ok("")
}
/// Renders an `<li>` element based on its first child only.
///
/// * No children, or a first child that is neither text nor an element:
///   nothing is written and `Ok("")` is returned.
/// * Text first child: writes `* <text>` plus a newline and returns `Ok("")`.
/// * `<a>` or `<img>` first child: delegates to [`handle_link_node`] /
///   [`handle_image_node`] and returns `Ok("Recursion Handled")`, the marker
///   the tree walker checks to avoid visiting that child a second time.
///
/// # Errors
/// * `"Unsupported nested element in li element"` for any other element.
/// * Any error from the delegated link/image handler.
pub fn handle_list_node(state: &mut State, node: &NodeRef) -> Result<&'static str, String> {
    let first = match node.first_child() {
        Some(child) => child,
        None => return Ok(""),
    };

    if let Some(text) = first.as_text() {
        state.gemtext.push_str("* ");
        state.gemtext.push_str(&text.borrow());
        state.gemtext.push('\n');
        return Ok("");
    }

    let element = match first.as_element() {
        Some(element) => element,
        None => return Ok(""),
    };

    match element.name.local {
        local_name!("a") => {
            handle_link_node(state, element, &first).map(|_| "Recursion Handled")
        }
        local_name!("img") => handle_image_node(state, element).map(|_| "Recursion Handled"),
        _ => Err(String::from("Unsupported nested element in li element")),
    }
}
/// Handles a `<br>` element by appending the configured line-break string
/// (`state.config.empty_lines_br_str`) to `state.gemtext`.
///
/// The node itself is not inspected. Always returns `Ok("")`.
pub fn handle_break_node(state: &mut State, _node: &NodeRef) -> Result<&'static str, String> {
    let spacing = state.config.empty_lines_br_str.as_str();
    state.gemtext.push_str(spacing);
    Ok("")
}

View File

@ -3,130 +3,74 @@
// Creation Date: 2021-2-23
// License: AGPLv3
extern crate scraper;
#[cfg(test)]
mod tests;
pub mod config;
mod handlers;
use scraper::{Html, Selector};
use html5ever::local_name;
use kuchiki::traits::*;
use kuchiki::{NodeData, NodeRef};
use config::Config;
use handlers::*;
/// Working state passed to every node handler during a conversion.
pub struct State {
// Gemtext produced so far; handlers append to it and `convert` returns it.
pub gemtext: String,
// Spacing and link-rewriting options the handlers read while emitting text.
pub config: Config,
}
impl Default for State {
fn default() -> Self {
State {
gemtext: String::new(),
config: Config::default(),
}
}
}
/// Walks the tree depth-first, running [`handle_node`] on `current` and then
/// on each of its descendants.
///
/// Children are skipped only when the handler answers
/// `Ok("Recursion Handled")`, meaning it already consumed them. Every other
/// outcome, including an `Err`, is discarded and the children are still
/// visited.
fn handle_node_recurisive(state: &mut State, current: &NodeRef) {
    match handle_node(state, current) {
        Ok("Recursion Handled") => {}
        _ => {
            for child in current.children() {
                handle_node_recurisive(state, &child);
            }
        }
    }
}
/// Dispatches a single node to the handler for its tag.
///
/// Non-element nodes and elements with no dedicated handler produce `Ok("")`
/// and write nothing. `h1`..`h6` go to [`handle_header_node`] with the
/// matching level; `p`, `a`, `img`, `li` and `br` go to their respective
/// handlers. The handler's result is returned as-is.
fn handle_node(state: &mut State, node: &NodeRef) -> Result<&'static str, String> {
    let element = match node.data() {
        NodeData::Element(element) => element,
        _ => return Ok(""),
    };

    // Headings differ only by level, so resolve the level first.
    let heading_level = match element.name.local {
        local_name!("h1") => Some(1),
        local_name!("h2") => Some(2),
        local_name!("h3") => Some(3),
        local_name!("h4") => Some(4),
        local_name!("h5") => Some(5),
        local_name!("h6") => Some(6),
        _ => None,
    };
    if let Some(level) = heading_level {
        return handle_header_node(state, level, node);
    }

    match element.name.local {
        local_name!("p") => handle_paragraph_node(state, node),
        local_name!("a") => handle_link_node(state, element, node),
        local_name!("img") => handle_image_node(state, element),
        local_name!("li") => handle_list_node(state, node),
        local_name!("br") => handle_break_node(state, node),
        _ => Ok(""),
    }
}
pub fn convert(html_src: &str) -> String {
let document = Html::parse_document(html_src);
let selector = Selector::parse("a, br, h1, h2, h3, h4, h5, h6, img, li, p").unwrap();
let document = kuchiki::parse_html().one(html_src);
let config = config::Config::default();
let empty_lines_before_h = config.empty_lines_before_h_str();
let empty_lines_after_h = config.empty_lines_after_h_str();
let empty_lines_before_p = config.empty_lines_before_p_str();
let empty_lines_after_p = config.empty_lines_after_p_str();
let empty_lines_before_a = config.empty_lines_before_a_str();
let empty_lines_after_a = config.empty_lines_after_a_str();
let empty_lines_before_img = config.empty_lines_before_img_str();
let empty_lines_between_img_alt = config.empty_lines_between_img_alt_str();
let empty_lines_after_img = config.empty_lines_after_img_str();
let empty_lines_before_list = config.empty_lines_before_list_str();
let empty_lines_between_list_items = config.empty_lines_between_list_items_str();
let empty_lines_after_list = config.empty_lines_after_list_str();
let br_lines = config.empty_lines_br_str();
let convert_mydomain = Some(String::from(".html"));
let body_selector = "body";
let body = document.select(body_selector).unwrap();
let mut in_list = false;
let mut first_list_item = false;
let mut state = State::default();
state.config.convert_mydomain_links_to_gmi = Some(String::from("mydomain"));
let mut gmi = String::new();
for node in document.select(&selector) {
if !in_list && node.value().name() == "li" {
gmi.push_str(&empty_lines_before_list);
in_list = true;
first_list_item = true;
} else if in_list && node.value().name() != "li" {
gmi.push_str(&empty_lines_after_list);
in_list = false;
}
let html = node.inner_html();
match node.value().name() {
"h1" => gmi.push_str(&format!(
"{}# {}{}",
empty_lines_before_h, html, empty_lines_after_h
)),
"h2" => gmi.push_str(&format!(
"{}## {}{}",
empty_lines_before_h, html, empty_lines_after_h
)),
"h3" => gmi.push_str(&format!(
"{}### {}{}",
empty_lines_before_h, html, empty_lines_after_h
)),
"h4" => gmi.push_str(&format!(
"{}#### {}{}",
empty_lines_before_h, html, empty_lines_after_h
)),
"h5" => gmi.push_str(&format!(
"{}##### {}{}",
empty_lines_before_h, html, empty_lines_after_h
)),
"h6" => gmi.push_str(&format!(
"{}###### {}{}",
empty_lines_before_h, html, empty_lines_after_h
)),
"p" => gmi.push_str(&format!(
"{}{}{}",
empty_lines_before_p, html, empty_lines_after_p
)),
"a" => {
let link = node.value().attr("href").unwrap_or("");
if link.is_empty() {
gmi.push_str(&format!(
"{}=> {}{}",
empty_lines_before_a, html, empty_lines_after_a
));
} else {
let mut link = link.to_owned();
if let Some(ref mydomain) = convert_mydomain {
if link.ends_with(".html") && link.contains(mydomain.as_str()) {
link.truncate(link.len() - 4);
link.push_str("gmi");
}
}
gmi.push_str(&format!(
"{}=> {} {}{}",
empty_lines_before_a, link, html, empty_lines_after_a
));
}
}
"img" => {
let link = node.value().attr("src").unwrap_or("");
let title = node.value().attr("title").unwrap_or("");
let alt = node.value().attr("alt").unwrap_or("");
gmi.push_str(&empty_lines_before_img);
if title.is_empty() {
gmi.push_str(&format!("=> {}", link));
} else {
gmi.push_str(&format!("=> {} {}", link, title));
}
if !alt.is_empty() {
gmi.push_str(&format!("\n{}{}", empty_lines_between_img_alt, alt));
}
gmi.push_str(&empty_lines_after_img);
}
"li" => {
println!("li");
if !first_list_item {
gmi.push_str(&empty_lines_between_list_items);
}
gmi.push_str(&format!("* {}", html));
first_list_item = false;
}
"br" => {
gmi.push_str(&format!("{}", br_lines));
continue;
}
tag => panic!("matched an unsupported tag: {}", tag),
}
gmi.push('\n');
for node_data_ref in body {
let current = node_data_ref.as_node();
handle_node_recurisive(&mut state, current);
}
gmi
state.gemtext
}

View File

@ -3,6 +3,9 @@
// Creation Date: 2021-2-26
// License: AGPLv3
#![macro_use]
extern crate html5ever;
use html2gemini_rs::*;
use std::fs::File;
use std::{env, io::Read};