From 2d5a1949f90374b3746f9c30b83764fdb3fc0e6e Mon Sep 17 00:00:00 2001 From: sloum Date: Sat, 12 Sep 2020 23:16:34 -0700 Subject: [PATCH] Adds the world's worst html parser to the gemini module --- gemini/gemini.go | 19 +++--- gemini/html_parser.go | 136 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+), 7 deletions(-) create mode 100644 gemini/html_parser.go diff --git a/gemini/gemini.go b/gemini/gemini.go index db02ca5..099e27d 100644 --- a/gemini/gemini.go +++ b/gemini/gemini.go @@ -78,7 +78,7 @@ func (t *TofuDigest) Match(host, localCert string, cState *tls.ConnectionState) return fmt.Errorf("EXP") } - if err := cert.VerifyHostname(host); err != nil { + if err := cert.VerifyHostname(host); err != nil && cert.Subject.CommonName != host { return fmt.Errorf("Certificate error: %s", err) } @@ -328,14 +328,18 @@ func Visit(host, port, resource string, td *TofuDigest) (Capsule, error) { } capsule.MimeMaj = minMajMime[0] capsule.MimeMin = minMajMime[1] + + if len(resource) > 0 && resource[0] != '/' { + resource = fmt.Sprintf("/%s", resource) + } else if resource == "" { + resource = "/" + } + currentUrl := fmt.Sprintf("gemini://%s:%s%s", host, port, resource) + if capsule.MimeMaj == "text" && capsule.MimeMin == "gemini" { - if len(resource) > 0 && resource[0] != '/' { - resource = fmt.Sprintf("/%s", resource) - } else if resource == "" { - resource = "/" - } - currentUrl := fmt.Sprintf("gemini://%s:%s%s", host, port, resource) capsule.Content, capsule.Links = parseGemini(body, currentUrl) + } else if capsule.MimeMaj == "text" && capsule.MimeMin == "html" { + capsule.Content, capsule.Links = ParseHTML(body, currentUrl) } else { capsule.Content = body } @@ -408,6 +412,7 @@ func parseGemini(b, currentUrl string) (string, []string) { return strings.Join(splitContent[:outputIndex], "\n"), links } + // handleRelativeUrl provides link completion func HandleRelativeUrl(relLink, current string) (string, error) { base, err := url.Parse(current) diff 
--git a/gemini/html_parser.go b/gemini/html_parser.go new file mode 100644 index 0000000..c1ff5f4 --- /dev/null +++ b/gemini/html_parser.go @@ -0,0 +1,136 @@ +package gemini + +import ( + "fmt" + "strings" +) + +type tag struct { + name string + attributes map[string]string + text string +} + +var fields []string +var out strings.Builder +var links []string = make([]string, 0, 10) +var inHead bool = false + +func ParseHTML(body, currentURL string) (string, []string) { + out.Reset() + links = make([]string, 0, 10) + inHead = false + + body = strings.Replace(body, "<", " <", -1) + body = strings.Replace(body, ">", "> ", -1) + fields = strings.Fields(body) + + var text string + for i := 0; i < len(fields); i++ { + if strings.HasPrefix(fields[i], "") && len(tag) >= 2 { + tag = tag[:len(tag)-1] + hitClose = true + } + tag = strings.ToLower(tag) +MainSwitch: + switch tag { + case "head": + i = skipToClose("", i) + case "script": + i = skipToClose("", i) + case "h1": + out = "\n# " + case "h2": + out = "\n## " + case "h3": + out = "\n### " + case "li", "dt": + out = "* " + case "hr": + out = "\n\n-------------------------\n\n" + case "blockquote": + out = "\n> " + case "p", "header", "nav", "footer", "aside", "div", "main", "article", "details", "summary", "ul", "ol", "dl": + out = "\n" + case "a", "img": + target := "href=" + if tag == "img" { + target = "src=" + } + for ;i < len(fields) && !strings.HasPrefix(fields[i], target) ; i++ { + if strings.HasSuffix(fields[i], ">") { + hitClose = true + break MainSwitch + } + } + fields[i] = strings.Replace(fields[i], target, "", 1) + fields[i] = strings.Replace(fields[i], "\"", "", -1) + fields[i] = strings.Replace(fields[i], "'", "", -1) + if strings.HasSuffix(fields[i], ">") { + fields[i] = fields[i][:len(fields[i])-1] + hitClose = true + } + link := fields[i] + if strings.Index(link, "://") < 0 { + link, _ = HandleRelativeUrl(link, currentURL) + } + + links = append(links, link) + if tag == "img" { + out = 
fmt.Sprintf("\n[%d]IMG\n", len(links)) + } else { + out = fmt.Sprintf("[%d]", len(links)) + } + } + if !hitClose { + for ;!strings.HasSuffix(fields[i], ">"); i++ { + continue + } + } + + return out, i +} + + +func parseClose(i int) string { + tag := strings.Trim(fields[i], " \n\r\t<>/") + switch tag { + case "p", "header", "nav", "footer", "aside", "div", "main", "article", "details", "summary", "ul", "ol", "dl", "li", "dd", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6": + return "\n" + default: + return "" + } +}