Adds the world's worst HTML parser to the gemini module

This commit is contained in:
sloum 2020-09-12 23:16:34 -07:00
parent 97b74ea767
commit 2d5a1949f9
2 changed files with 148 additions and 7 deletions

View File

@ -78,7 +78,7 @@ func (t *TofuDigest) Match(host, localCert string, cState *tls.ConnectionState)
return fmt.Errorf("EXP")
}
if err := cert.VerifyHostname(host); err != nil {
if err := cert.VerifyHostname(host); err != nil && cert.Subject.CommonName != host {
return fmt.Errorf("Certificate error: %s", err)
}
@ -328,14 +328,18 @@ func Visit(host, port, resource string, td *TofuDigest) (Capsule, error) {
}
capsule.MimeMaj = minMajMime[0]
capsule.MimeMin = minMajMime[1]
if len(resource) > 0 && resource[0] != '/' {
resource = fmt.Sprintf("/%s", resource)
} else if resource == "" {
resource = "/"
}
currentUrl := fmt.Sprintf("gemini://%s:%s%s", host, port, resource)
if capsule.MimeMaj == "text" && capsule.MimeMin == "gemini" {
if len(resource) > 0 && resource[0] != '/' {
resource = fmt.Sprintf("/%s", resource)
} else if resource == "" {
resource = "/"
}
currentUrl := fmt.Sprintf("gemini://%s:%s%s", host, port, resource)
capsule.Content, capsule.Links = parseGemini(body, currentUrl)
} else if capsule.MimeMaj == "text" && capsule.MimeMin == "html" {
capsule.Content, capsule.Links = ParseHTML(body, currentUrl)
} else {
capsule.Content = body
}
@ -408,6 +412,7 @@ func parseGemini(b, currentUrl string) (string, []string) {
return strings.Join(splitContent[:outputIndex], "\n"), links
}
// HandleRelativeUrl provides link completion
func HandleRelativeUrl(relLink, current string) (string, error) {
base, err := url.Parse(current)

136
gemini/html_parser.go Normal file
View File

@ -0,0 +1,136 @@
package gemini
import (
"fmt"
"strings"
)
// tag groups the pieces of a single HTML element.
//
// NOTE(review): nothing in this file constructs or reads a tag yet; the
// field meanings below are taken from their names and should be confirmed
// once the type is put to use.
type tag struct {
	name       string            // presumably the element name, e.g. "a"
	attributes map[string]string // presumably attribute name -> value
	text       string            // presumably the element's text content
}
// Parser state shared by ParseHTML and its helpers. ParseHTML resets all of
// it at the start of every call; because it lives at package level, two
// ParseHTML calls running at the same time would share (and corrupt) it.
var (
	// fields holds the whitespace-separated tokens of the document
	// currently being parsed.
	fields []string

	// out accumulates the rendered text for the current document.
	out strings.Builder

	// links collects the link targets found so far, in document order.
	links = make([]string, 0, 10)

	// inHead is cleared by ParseHTML; no code in this file reads it.
	inHead bool
)
// ParseHTML converts an HTML document into plain text with numbered link
// markers.
//
// body is the raw HTML and currentURL is handed to parseTag for every
// opening tag. The document is tokenised very crudely: a space is inserted
// before every '<' and after every '>', and the result is split on
// whitespace. Each token is then either a closing tag (handled by
// parseClose), an opening tag (handled by parseTag, which may consume
// several tokens), or plain text, which is emitted followed by one space.
//
// It returns the rendered text and the package-level links slice as filled
// in during this call. All package-level parser state is reset on entry, so
// the function is not safe for concurrent use.
func ParseHTML(body, currentURL string) (string, []string) {
	// Start from a clean slate; results of a previous call must not leak.
	out.Reset()
	links = make([]string, 0, 10)
	inHead = false

	// Pad angle brackets so that tags become their own whitespace-delimited
	// tokens. Neither replacement produces the other's pattern, so a single
	// pass is equivalent to two sequential replacements.
	padded := strings.NewReplacer("<", " <", ">", "> ").Replace(body)
	fields = strings.Fields(padded)

	for idx := 0; idx < len(fields); idx++ {
		token := fields[idx]
		switch {
		case strings.HasPrefix(token, "</"):
			out.WriteString(parseClose(idx))
		case token[0] == '<':
			// parseTag reports the last token it consumed; resume after it.
			rendered, last := parseTag(idx, currentURL)
			idx = last
			out.WriteString(rendered)
		default:
			out.WriteString(token + " ")
		}
	}
	return out.String(), links
}
// skipToClose advances from index i to the first token in fields that equals
// tag (for example "</head>") and returns that token's index. If no such
// token exists it returns len(fields).
//
// The comparison ignores case: parseTag lower-cases the opening tag name
// before calling this, so an upper-case document ("<HEAD>...</HEAD>") must
// still find its closing tag; otherwise the rest of the document would be
// skipped.
func skipToClose(tag string, i int) int {
	for i < len(fields) && !strings.EqualFold(fields[i], tag) {
		i++
	}
	return i
}
// parseTag renders the opening tag that starts at fields[i].
//
// i must be a valid index into fields. currentURL is the URL of the document
// being parsed and is used to resolve relative href/src values.
//
// It returns the text to emit for the tag and the index of the last token
// that belongs to the tag (the one ending in '>'). For <head> and <script>
// the returned index is that of the matching closing tag, so the caller
// skips the whole element. When the document ends before the tag is
// terminated the returned index is len(fields) (or i for a lone trailing
// "<"), which lets the caller's loop finish instead of panicking.
//
// Links found in <a href=...> and <img src=...> are appended to the
// package-level links slice and rendered as "[n]" / "[n]IMG" markers.
func parseTag(i int, currentURL string) (string, int) {
	rendered := ""
	hitClose := false

	tag := strings.Replace(fields[i], "<", "", 1)
	if len(tag) < 1 {
		// The token was a bare "<": the tag name is the next token, if any.
		if i+1 >= len(fields) {
			return rendered, i
		}
		i++
		tag = fields[i]
	}
	if strings.HasSuffix(tag, ">") && len(tag) >= 2 {
		tag = tag[:len(tag)-1]
		hitClose = true
	}
	tag = strings.ToLower(tag)

MainSwitch:
	switch tag {
	case "head":
		i = skipToClose("</head>", i)
	case "script":
		i = skipToClose("</script>", i)
	case "h1":
		rendered = "\n# "
	case "h2":
		rendered = "\n## "
	case "h3":
		rendered = "\n### "
	case "li", "dt":
		rendered = "* "
	case "hr":
		rendered = "\n\n-------------------------\n\n"
	case "blockquote":
		rendered = "\n> "
	case "p", "header", "nav", "footer", "aside", "div", "main", "article", "details", "summary", "ul", "ol", "dl":
		rendered = "\n"
	case "a", "img":
		target := "href="
		if tag == "img" {
			target = "src="
		}
		// Look for the link attribute among the tag's tokens; give up if the
		// tag closes first.
		for ; i < len(fields) && !strings.HasPrefix(fields[i], target); i++ {
			if strings.HasSuffix(fields[i], ">") {
				hitClose = true
				break MainSwitch
			}
		}
		if i >= len(fields) {
			// Document ended inside the tag without a link attribute.
			break MainSwitch
		}

		// Strip the attribute name, any quotes, and a trailing '>'.
		link := strings.Replace(fields[i], target, "", 1)
		link = strings.Replace(link, "\"", "", -1)
		link = strings.Replace(link, "'", "", -1)
		if strings.HasSuffix(link, ">") {
			link = link[:len(link)-1]
			hitClose = true
		}

		if !strings.Contains(link, "://") {
			// Keep the raw value when it cannot be resolved so the link
			// numbering stays intact and the user still sees something.
			if resolved, err := HandleRelativeUrl(link, currentURL); err == nil {
				link = resolved
			}
		}
		links = append(links, link)
		if tag == "img" {
			rendered = fmt.Sprintf("\n[%d]IMG\n", len(links))
		} else {
			rendered = fmt.Sprintf("[%d]", len(links))
		}
	}

	if !hitClose {
		// Consume the remaining attribute tokens up to the end of the tag,
		// stopping at the end of the document for unterminated tags.
		for i < len(fields) && !strings.HasSuffix(fields[i], ">") {
			i++
		}
	}
	return rendered, i
}
// parseClose returns the text to emit for the closing tag at fields[i]:
// a newline for block-level elements and headings, and nothing for any
// other tag.
//
// The tag name is compared case-insensitively so that "</P>" or "</DIV>"
// behave like their lower-case forms, matching how parseTag treats opening
// tags.
func parseClose(i int) string {
	name := strings.ToLower(strings.Trim(fields[i], " \n\r\t<>/"))
	switch name {
	case "p", "header", "nav", "footer", "aside", "div", "main", "article", "details", "summary", "ul", "ol", "dl", "li", "dd", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6":
		return "\n"
	default:
		return ""
	}
}