diff --git a/.gitignore b/.gitignore index 75d85e8d..d3ef0199 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ dock.sh GoBuilds dist +hugolib/hugo_stats.json resources/sunset.jpg vendor diff --git a/config/commonConfig.go b/config/commonConfig.go index 17d5619b..ba99260a 100644 --- a/config/commonConfig.go +++ b/config/commonConfig.go @@ -29,11 +29,16 @@ import ( var DefaultBuild = Build{ UseResourceCacheWhen: "fallback", + WriteStats: false, } // Build holds some build related condfiguration. type Build struct { UseResourceCacheWhen string // never, fallback, always. Default is fallback + + // When enabled, will collect and write a hugo_stats.json with some build + // related aggregated data (e.g. CSS class names). + WriteStats bool } func (b Build) UseResourceCache(err error) bool { diff --git a/go.mod b/go.mod index 4b75840b..c12caa8f 100644 --- a/go.mod +++ b/go.mod @@ -55,7 +55,7 @@ require ( go.opencensus.io v0.22.0 // indirect gocloud.dev v0.15.0 golang.org/x/image v0.0.0-20191214001246-9130b4cfad52 - golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553 // indirect + golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553 golang.org/x/oauth2 v0.0.0-20190523182746-aaccbc9213b0 // indirect golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e golang.org/x/sys v0.0.0-20200107144601-ef85f5a75ddf // indirect diff --git a/hugolib/hugo_sites.go b/hugolib/hugo_sites.go index dca9e496..9ff4d36c 100644 --- a/hugolib/hugo_sites.go +++ b/hugolib/hugo_sites.go @@ -408,7 +408,11 @@ func applyDeps(cfg deps.DepsCfg, sites ...*Site) error { s.Deps = d // Set up the main publishing chain. - pub, err := publisher.NewDestinationPublisher(d.PathSpec.BaseFs.PublishFs, s.outputFormatsConfig, s.mediaTypesConfig, cfg.Cfg) + pub, err := publisher.NewDestinationPublisher( + d.ResourceSpec, + s.outputFormatsConfig, + s.mediaTypesConfig, + ) if err != nil { return err diff --git a/hugolib/hugo_sites_build.go b/hugolib/hugo_sites_build.go index 6a65605f..fac20e88 100644 --- a/hugolib/hugo_sites_build.go +++ b/hugolib/hugo_sites_build.go @@ -16,11 +16,17 @@ package hugolib import ( "bytes" "context" + "encoding/json" "fmt" "os" + "path/filepath" "runtime/trace" "strings" + "github.com/gohugoio/hugo/publisher" + + "github.com/gohugoio/hugo/hugofs" + "github.com/gohugoio/hugo/common/para" "github.com/gohugoio/hugo/config" "github.com/gohugoio/hugo/resources/postpub" @@ -146,10 +152,10 @@ func (h *HugoSites) Build(config BuildCfg, events ...fsnotify.Event) error { if err != nil { h.SendError(err) } - } - if err := h.postProcess(); err != nil { - h.SendError(err) + if err = h.postProcess(); err != nil { + h.SendError(err) + } } if h.Metrics != nil { @@ -337,6 +343,12 @@ func (h *HugoSites) render(config *BuildCfg) error { } func (h *HugoSites) postProcess() error { + // Make sure to write any build stats to disk first so it's available + // to the post processors. + if err := h.writeBuildStats(); err != nil { + return err + } + var toPostProcess []resource.OriginProvider for _, s := range h.Sites { for _, v := range s.ResourceSpec.PostProcessResources { @@ -422,3 +434,47 @@ func (h *HugoSites) postProcess() error { return g.Wait() } + +type publishStats struct { + CSSClasses string `json:"cssClasses"` +} + +func (h *HugoSites) writeBuildStats() error { + if !h.ResourceSpec.BuildConfig.WriteStats { + return nil + } + + htmlElements := &publisher.HTMLElements{} + for _, s := range h.Sites { + stats := s.publisher.PublishStats() + htmlElements.Merge(stats.HTMLElements) + } + + htmlElements.Sort() + + stats := publisher.PublishStats{ + HTMLElements: *htmlElements, + } + + js, err := json.MarshalIndent(stats, "", " ") + if err != nil { + return err + } + + filename := filepath.Join(h.WorkingDir, "hugo_stats.json") + + // Make sure it's always written to the OS fs. + if err := afero.WriteFile(hugofs.Os, filename, js, 0666); err != nil { + return err + } + + // Write to the destination, too, if a mem fs is in play. + if h.Fs.Source != hugofs.Os { + if err := afero.WriteFile(h.Fs.Destination, filename, js, 0666); err != nil { + return err + } + } + + return nil + +} diff --git a/hugolib/site_test.go b/hugolib/site_test.go index 0b05aac1..e404d80a 100644 --- a/hugolib/site_test.go +++ b/hugolib/site_test.go @@ -980,3 +980,47 @@ func TestRefIssues(t *testing.T) { b.AssertFileContent("public/post/nested-a/content-a/index.html", `Content: http://example.com/post/nested-b/content-b/`) } + +func TestClassCollector(t *testing.T) { + b := newTestSitesBuilder(t) + b.WithConfigFile("toml", ` + +[build] + writeStats = true + +`) + + b.WithTemplates("index.html", ` + +
Foo
+ +Some text. + +
Foo
+`) + + b.WithContent("p1.md", "") + + b.Build(BuildCfg{}) + + b.AssertFileContent("hugo_stats.json", ` +{ + "htmlElements": { + "tags": [ + "div" + ], + "classes": [ + "a", + "b", + "c", + "d", + "e" + ], + "ids": [ + "el1", + "el2" + ] + } + } +`) +} diff --git a/publisher/htmlElementsCollector.go b/publisher/htmlElementsCollector.go new file mode 100644 index 00000000..c6e0d3f0 --- /dev/null +++ b/publisher/htmlElementsCollector.go @@ -0,0 +1,268 @@ +// Copyright 2020 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package publisher + +import ( + "github.com/gohugoio/hugo/helpers" + "golang.org/x/net/html" + yaml "gopkg.in/yaml.v2" + + "bytes" + "sort" + "strings" + "sync" +) + +func newHTMLElementsCollector() *htmlElementsCollector { + return &htmlElementsCollector{ + elementSet: make(map[string]bool), + } +} + +func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *cssClassCollectorWriter { + return &cssClassCollectorWriter{ + collector: collector, + } +} + +// HTMLElements holds lists of tags and attribute values for classes and id. +type HTMLElements struct { + Tags []string `json:"tags"` + Classes []string `json:"classes"` + IDs []string `json:"ids"` +} + +func (h *HTMLElements) Merge(other HTMLElements) { + h.Tags = append(h.Tags, other.Tags...) + h.Classes = append(h.Classes, other.Classes...) + h.IDs = append(h.IDs, other.IDs...) + + h.Tags = helpers.UniqueStringsReuse(h.Tags) + h.Classes = helpers.UniqueStringsReuse(h.Classes) + h.IDs = helpers.UniqueStringsReuse(h.IDs) + +} + +func (h *HTMLElements) Sort() { + sort.Strings(h.Tags) + sort.Strings(h.Classes) + sort.Strings(h.IDs) +} + +type cssClassCollectorWriter struct { + collector *htmlElementsCollector + buff bytes.Buffer + + isCollecting bool + dropValue bool + inQuote bool +} + +func (w *cssClassCollectorWriter) Write(p []byte) (n int, err error) { + n = len(p) + i := 0 + + for i < len(p) { + if !w.isCollecting { + for ; i < len(p); i++ { + b := p[i] + if b == '<' { + w.startCollecting() + break + } + } + } + + if w.isCollecting { + for ; i < len(p); i++ { + b := p[i] + if !w.inQuote && b == '/' { + // End element, we don't care about those. + w.endCollecting(true) + break + } + w.toggleIfQuote(b) + if !w.inQuote && b == '>' { + w.endCollecting(false) + break + } + w.buff.WriteByte(b) + } + + if !w.isCollecting { + if w.dropValue { + w.buff.Reset() + } else { + // First check if we have processed this element before. + w.collector.mu.RLock() + + // See https://github.com/dominikh/go-tools/issues/723 + //lint:ignore S1030 This construct avoids memory allocation for the string. + seen := w.collector.elementSet[string(w.buff.Bytes())] + w.collector.mu.RUnlock() + if seen { + w.buff.Reset() + continue + } + + s := w.buff.String() + + w.buff.Reset() + + el := parseHTMLElement(s) + + w.collector.mu.Lock() + w.collector.elementSet[s] = true + if el.Tag != "" { + w.collector.elements = append(w.collector.elements, el) + } + w.collector.mu.Unlock() + } + } + } + } + + return +} + +func (c *cssClassCollectorWriter) endCollecting(drop bool) { + c.isCollecting = false + c.inQuote = false + c.dropValue = drop +} + +func (c *cssClassCollectorWriter) startCollecting() { + c.isCollecting = true + c.dropValue = false +} + +func (c *cssClassCollectorWriter) toggleIfQuote(b byte) { + if isQuote(b) { + c.inQuote = !c.inQuote + } +} + +type htmlElement struct { + Tag string + Classes []string + IDs []string +} + +type htmlElementsCollector struct { + // Contains the raw HTML string. We will get the same element + // several times, and want to avoid costly reparsing when this + // is used for aggregated data only. + elementSet map[string]bool + + elements []htmlElement + + mu sync.RWMutex +} + +func (c *htmlElementsCollector) getHTMLElements() HTMLElements { + + var ( + classes []string + ids []string + tags []string + ) + + for _, el := range c.elements { + classes = append(classes, el.Classes...) + ids = append(ids, el.IDs...) + tags = append(tags, el.Tag) + } + + classes = helpers.UniqueStringsSorted(classes) + ids = helpers.UniqueStringsSorted(ids) + tags = helpers.UniqueStringsSorted(tags) + + els := HTMLElements{ + Classes: classes, + IDs: ids, + Tags: tags, + } + + return els +} + +func isQuote(b byte) bool { + return b == '"' || b == '\'' +} + +var htmlJsonFixer = strings.NewReplacer(", ", "\n") + +func parseHTMLElement(elStr string) (el htmlElement) { + elStr = strings.TrimSpace(elStr) + if !strings.HasSuffix(elStr, ">") { + elStr += ">" + } + n, err := html.Parse(strings.NewReader(elStr)) + if err != nil { + return + } + var walk func(*html.Node) + walk = func(n *html.Node) { + if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) { + el.Tag = n.Data + + for _, a := range n.Attr { + switch { + case strings.EqualFold(a.Key, "id"): + // There should be only one, but one never knows... + el.IDs = append(el.IDs, a.Val) + default: + if strings.EqualFold(a.Key, "class") { + el.Classes = append(el.Classes, strings.Fields(a.Val)...) + } else { + key := strings.ToLower(a.Key) + val := strings.TrimSpace(a.Val) + if strings.Contains(key, "class") && strings.HasPrefix(val, "{") { + // This looks like a Vue or AlpineJS class binding. + // Try to unmarshal it as YAML and pull the keys. + // This may look odd, as the source is (probably) JS (JSON), but the YAML + // parser is much more lenient with simple JS input, it seems. + m := make(map[string]interface{}) + val = htmlJsonFixer.Replace(strings.Trim(val, "{}")) + // Remove leading space to make it look like YAML. + lines := strings.Split(val, "\n") + for i, l := range lines { + lines[i] = strings.TrimSpace(l) + } + val = strings.Join(lines, "\n") + err := yaml.Unmarshal([]byte(val), &m) + if err == nil { + for k := range m { + el.Classes = append(el.Classes, strings.Fields(k)...) + } + } else { + // Just insert the raw values. This is used for CSS class pruning + // so, it's important not to leave out values that may be a CSS class. + el.Classes = append(el.Classes, strings.Fields(val)...) + } + } + } + } + } + } + + for c := n.FirstChild; c != nil; c = c.NextSibling { + walk(c) + } + } + + walk(n) + + return +} diff --git a/publisher/htmlElementsCollector_test.go b/publisher/htmlElementsCollector_test.go new file mode 100644 index 00000000..3ef159d8 --- /dev/null +++ b/publisher/htmlElementsCollector_test.go @@ -0,0 +1,81 @@ +// Copyright 2020 The Hugo Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package publisher + +import ( + "fmt" + "strings" + "testing" + + qt "github.com/frankban/quicktest" +) + +func TestClassCollector(t *testing.T) { + c := qt.New((t)) + + f := func(tags, classes, ids string) HTMLElements { + var tagss, classess, idss []string + if tags != "" { + tagss = strings.Split(tags, " ") + } + if classes != "" { + classess = strings.Split(classes, " ") + } + if ids != "" { + idss = strings.Split(ids, " ") + } + return HTMLElements{ + Tags: tagss, + Classes: classess, + IDs: idss, + } + } + + for _, test := range []struct { + name string + html string + expect HTMLElements + }{ + {"basic", ``, f("body", "a b", "")}, + {"duplicates", `
`, f("div", "a b", "")}, + {"single quote", ``, f("body", "a b", "")}, + {"no quote", ``, f("body", "b", "myelement")}, + + {"AlpineJS bind 1", ` +
+
+ `, f("body div", "class1 class2 class3", "")}, + + {"Alpine bind 2", `
FOO
`, + f("div", "bg-black bg-gray-300 inline-block mb-2 mr-1 px-2 py-2 rounded", "")}, + + {"Alpine bind 3", `
`, f("div", "text-gray-800 text-white", "")}, + {"Alpine bind 4", `
`, f("div", "text-gray-800 text-white", "")}, + + {"Vue bind", `
`, f("div", "active", "")}, + } { + c.Run(test.name, func(c *qt.C) { + w := newHTMLElementsCollectorWriter(newHTMLElementsCollector()) + fmt.Fprint(w, test.html) + got := w.collector.getHTMLElements() + c.Assert(got, qt.DeepEquals, test.expect) + }) + } + +} diff --git a/publisher/publisher.go b/publisher/publisher.go index f30073c0..8b8d2fa6 100644 --- a/publisher/publisher.go +++ b/publisher/publisher.go @@ -1,4 +1,4 @@ -// Copyright 2019 The Hugo Authors. All rights reserved. +// Copyright 2020 The Hugo Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -18,7 +18,8 @@ import ( "io" "sync/atomic" - "github.com/gohugoio/hugo/config" + "github.com/gohugoio/hugo/resources" + "github.com/gohugoio/hugo/media" "github.com/gohugoio/hugo/minifiers" @@ -68,17 +69,21 @@ type Descriptor struct { // DestinationPublisher is the default and currently only publisher in Hugo. This // publisher prepares and publishes an item to the defined destination, e.g. /public. type DestinationPublisher struct { - fs afero.Fs - min minifiers.Client + fs afero.Fs + min minifiers.Client + htmlElementsCollector *htmlElementsCollector } // NewDestinationPublisher creates a new DestinationPublisher. -func NewDestinationPublisher(fs afero.Fs, outputFormats output.Formats, mediaTypes media.Types, cfg config.Provider) (pub DestinationPublisher, err error) { - pub = DestinationPublisher{fs: fs} - pub.min, err = minifiers.New(mediaTypes, outputFormats, cfg) - if err != nil { - return +func NewDestinationPublisher(rs *resources.Spec, outputFormats output.Formats, mediaTypes media.Types) (pub DestinationPublisher, err error) { + fs := rs.BaseFs.PublishFs + cfg := rs.Cfg + var classCollector *htmlElementsCollector + if rs.BuildConfig.WriteStats { + classCollector = newHTMLElementsCollector() } + pub = DestinationPublisher{fs: fs, htmlElementsCollector: classCollector} + pub.min, err = minifiers.New(mediaTypes, outputFormats, cfg) return } @@ -111,16 +116,38 @@ func (p DestinationPublisher) Publish(d Descriptor) error { } defer f.Close() - _, err = io.Copy(f, src) + var w io.Writer = f + + if p.htmlElementsCollector != nil && d.OutputFormat.IsHTML { + w = io.MultiWriter(w, newHTMLElementsCollectorWriter(p.htmlElementsCollector)) + } + + _, err = io.Copy(w, src) if err == nil && d.StatCounter != nil { atomic.AddUint64(d.StatCounter, uint64(1)) } + return err } +func (p DestinationPublisher) PublishStats() PublishStats { + if p.htmlElementsCollector == nil { + return PublishStats{} + } + + return PublishStats{ + HTMLElements: p.htmlElementsCollector.getHTMLElements(), + } +} + +type PublishStats struct { + HTMLElements HTMLElements `json:"htmlElements"` +} + // Publisher publishes a result file. type Publisher interface { Publish(d Descriptor) error + PublishStats() PublishStats } // XML transformer := transform.New(urlreplacers.NewAbsURLInXMLTransformer(path)) diff --git a/publisher/publisher_test.go b/publisher/publisher_test.go deleted file mode 100644 index 200accc8..00000000 --- a/publisher/publisher_test.go +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright 2018 The Hugo Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package publisher