1// Copyright 2013 The Go Authors.  All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// This program takes an HTML file and outputs a corresponding article file in
6// present format. See: golang.org/x/tools/present
7package main // import "golang.org/x/tools/cmd/html2article"
8
9import (
10	"bytes"
11	"errors"
12	"flag"
13	"fmt"
14	"io"
15	"log"
16	"net/url"
17	"os"
18	"regexp"
19	"strings"
20
21	"golang.org/x/net/html"
22	"golang.org/x/net/html/atom"
23)
24
25func main() {
26	flag.Parse()
27
28	err := convert(os.Stdout, os.Stdin)
29	if err != nil {
30		log.Fatal(err)
31	}
32}
33
34func convert(w io.Writer, r io.Reader) error {
35	root, err := html.Parse(r)
36	if err != nil {
37		return err
38	}
39
40	style := find(root, isTag(atom.Style))
41	if err := parseStyles(style); err != nil {
42		log.Printf("couldn't parse all styles: %v", err)
43	}
44
45	body := find(root, isTag(atom.Body))
46	if body == nil {
47		return errors.New("couldn't find body")
48	}
49	article := limitNewlineRuns(makeHeadings(strings.TrimSpace(text(body))))
50	_, err = fmt.Fprintf(w, "Title\n\n%s", article)
51	return err
52}
53
// Style is the marker string that is placed around a run of text to give it an
// inline text style in the generated article.
type Style string

// The inline text styles recognised by the converter.
const (
	Bold   Style = "*"
	Italic Style = "_"
	Code   Style = "`"
)

// cssRules maps a CSS selector, as recorded by parseStyles, to the Style that
// its declarations imply. It is consulted by hasStyle.
var cssRules = map[string]Style{}
63
64func parseStyles(style *html.Node) error {
65	if style == nil || style.FirstChild == nil {
66		return errors.New("couldn't find styles")
67	}
68
69	styles := style.FirstChild.Data
70	readUntil := func(end rune) (string, bool) {
71		i := strings.IndexRune(styles, end)
72		if i < 0 {
73			return "", false
74		}
75		s := styles[:i]
76		styles = styles[i:]
77		return s, true
78	}
79
80	for {
81		sel, ok := readUntil('{')
82		if !ok && sel == "" {
83			break
84		} else if !ok {
85			return fmt.Errorf("could not parse selector %q", styles)
86		}
87
88		value, ok := readUntil('}')
89		if !ok {
90			return fmt.Errorf("couldn't parse style body for %s", sel)
91		}
92		switch {
93		case strings.Contains(value, "italic"):
94			cssRules[sel] = Italic
95		case strings.Contains(value, "bold"):
96			cssRules[sel] = Bold
97		case strings.Contains(value, "Consolas") || strings.Contains(value, "Courier New"):
98			cssRules[sel] = Code
99		}
100	}
101	return nil
102}
103
// newlineRun matches any run of two or more consecutive newline characters.
var newlineRun = regexp.MustCompile(`\n\n+`)

// limitNewlineRuns returns s with every run of two or more newlines collapsed
// to exactly two, so that at most one blank line separates blocks of text.
func limitNewlineRuns(s string) string {
	const blankLine = "\n\n"
	return newlineRun.ReplaceAllLiteralString(s, blankLine)
}
109
110func makeHeadings(body string) string {
111	buf := new(bytes.Buffer)
112	lines := strings.Split(body, "\n")
113	for i, s := range lines {
114		if i == 0 && !isBoldTitle(s) {
115			buf.WriteString("* Introduction\n\n")
116		}
117		if isBoldTitle(s) {
118			s = strings.TrimSpace(strings.Replace(s, "*", " ", -1))
119			s = "* " + s
120		}
121		buf.WriteString(s)
122		buf.WriteByte('\n')
123	}
124	return buf.String()
125}
126
// isBoldTitle reports whether s contains no space character and both begins
// and ends with an asterisk.
func isBoldTitle(s string) bool {
	if strings.Contains(s, " ") {
		return false
	}
	return strings.HasPrefix(s, "*") && strings.HasSuffix(s, "*")
}
132
// indent writes s to buf one line at a time. Each non-empty line is prefixed
// with a tab; empty lines are written without one. Every line, including the
// last, is followed by a newline.
func indent(buf *bytes.Buffer, s string) {
	lines := strings.Split(s, "\n")
	for i := range lines {
		if line := lines[i]; len(line) > 0 {
			buf.WriteString("\t")
			buf.WriteString(line)
		}
		buf.WriteString("\n")
	}
}
142
// unwrap writes s to buf with hard line breaks removed. Each line is trimmed
// of surrounding white space; consecutive non-blank lines are joined with a
// single space, and the first blank line after a non-blank one is written as
// two newlines. Blank lines that do not follow text produce no output.
func unwrap(buf *bytes.Buffer, s string) {
	inParagraph := false
	for _, raw := range strings.Split(s, "\n") {
		line := strings.TrimSpace(raw)
		if line == "" {
			if inParagraph {
				buf.WriteString("\n\n")
			}
			inParagraph = false
			continue
		}
		if inParagraph {
			buf.WriteByte(' ')
		}
		buf.WriteString(line)
		inParagraph = true
	}
}
162
163func text(n *html.Node) string {
164	var buf bytes.Buffer
165	walk(n, func(n *html.Node) bool {
166		switch n.Type {
167		case html.TextNode:
168			buf.WriteString(n.Data)
169			return false
170		case html.ElementNode:
171			// no-op
172		default:
173			return true
174		}
175		a := n.DataAtom
176		if a == atom.Span {
177			switch {
178			case hasStyle(Code)(n):
179				a = atom.Code
180			case hasStyle(Bold)(n):
181				a = atom.B
182			case hasStyle(Italic)(n):
183				a = atom.I
184			}
185		}
186		switch a {
187		case atom.Br:
188			buf.WriteByte('\n')
189		case atom.P:
190			unwrap(&buf, childText(n))
191			buf.WriteString("\n\n")
192		case atom.Li:
193			buf.WriteString("- ")
194			unwrap(&buf, childText(n))
195			buf.WriteByte('\n')
196		case atom.Pre:
197			indent(&buf, childText(n))
198			buf.WriteByte('\n')
199		case atom.A:
200			href, text := attr(n, "href"), childText(n)
201			// Skip links with no text.
202			if strings.TrimSpace(text) == "" {
203				break
204			}
205			// Don't emit empty links.
206			if strings.TrimSpace(href) == "" {
207				buf.WriteString(text)
208				break
209			}
210			// Use original url for Google Docs redirections.
211			if u, err := url.Parse(href); err != nil {
212				log.Printf("parsing url %q: %v", href, err)
213			} else if u.Host == "www.google.com" && u.Path == "/url" {
214				href = u.Query().Get("q")
215			}
216			fmt.Fprintf(&buf, "[[%s][%s]]", href, text)
217		case atom.Code:
218			buf.WriteString(highlight(n, "`"))
219		case atom.B:
220			buf.WriteString(highlight(n, "*"))
221		case atom.I:
222			buf.WriteString(highlight(n, "_"))
223		case atom.Img:
224			src := attr(n, "src")
225			fmt.Fprintf(&buf, ".image %s\n", src)
226		case atom.Iframe:
227			src, w, h := attr(n, "src"), attr(n, "width"), attr(n, "height")
228			fmt.Fprintf(&buf, "\n.iframe %s %s %s\n", src, h, w)
229		case atom.Param:
230			if attr(n, "name") == "movie" {
231				// Old style YouTube embed.
232				u := attr(n, "value")
233				u = strings.Replace(u, "/v/", "/embed/", 1)
234				if i := strings.Index(u, "&"); i >= 0 {
235					u = u[:i]
236				}
237				fmt.Fprintf(&buf, "\n.iframe %s 540 304\n", u)
238			}
239		case atom.Title:
240		default:
241			return true
242		}
243		return false
244	})
245	return buf.String()
246}
247
248func childText(node *html.Node) string {
249	var buf bytes.Buffer
250	for n := node.FirstChild; n != nil; n = n.NextSibling {
251		fmt.Fprint(&buf, text(n))
252	}
253	return buf.String()
254}
255
256func highlight(node *html.Node, char string) string {
257	t := strings.Replace(childText(node), " ", char, -1)
258	return fmt.Sprintf("%s%s%s", char, t, char)
259}
260
261type selector func(*html.Node) bool
262
263func isTag(a atom.Atom) selector {
264	return func(n *html.Node) bool {
265		return n.DataAtom == a
266	}
267}
268
269func hasClass(name string) selector {
270	return func(n *html.Node) bool {
271		for _, a := range n.Attr {
272			if a.Key == "class" {
273				for _, c := range strings.Fields(a.Val) {
274					if c == name {
275						return true
276					}
277				}
278			}
279		}
280		return false
281	}
282}
283
284func hasStyle(s Style) selector {
285	return func(n *html.Node) bool {
286		for rule, s2 := range cssRules {
287			if s2 != s {
288				continue
289			}
290			if strings.HasPrefix(rule, ".") && hasClass(rule[1:])(n) {
291				return true
292			}
293			if n.DataAtom.String() == rule {
294				return true
295			}
296		}
297		return false
298	}
299}
300
301func attr(node *html.Node, key string) (value string) {
302	for _, attr := range node.Attr {
303		if attr.Key == key {
304			return attr.Val
305		}
306	}
307	return ""
308}
309
310func find(n *html.Node, fn selector) *html.Node {
311	var result *html.Node
312	walk(n, func(n *html.Node) bool {
313		if result != nil {
314			return false
315		}
316		if fn(n) {
317			result = n
318			return false
319		}
320		return true
321	})
322	return result
323}
324
325func walk(n *html.Node, fn selector) {
326	if fn(n) {
327		for c := n.FirstChild; c != nil; c = c.NextSibling {
328			walk(c, fn)
329		}
330	}
331}
332