// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package html provides functions for escaping and unescaping HTML text.
|
|
6 package html
|
|
7
|
|
8 import (
|
|
9 "strings"
|
|
10 "unicode/utf8"
|
|
11 )
|
|
12
|
|
// replacementTable maps the numeric character references 0x80 through 0x9F,
// which old documents wrote assuming the Windows-1252 encoding, to the code
// points they were meant to denote. It is indexed by (value - 0x80).
// Entries whose value equals their own index (0x81, 0x8D, 0x8F, 0x90, 0x9D)
// are left unchanged by the lookup.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
var replacementTable = [...]rune{
	'\u20AC', // 0x80 (first entry)
	'\u0081', // 0x81
	'\u201A', // 0x82
	'\u0192', // 0x83
	'\u201E', // 0x84
	'\u2026', // 0x85
	'\u2020', // 0x86
	'\u2021', // 0x87
	'\u02C6', // 0x88
	'\u2030', // 0x89
	'\u0160', // 0x8A
	'\u2039', // 0x8B
	'\u0152', // 0x8C
	'\u008D', // 0x8D
	'\u017D', // 0x8E
	'\u008F', // 0x8F
	'\u0090', // 0x90
	'\u2018', // 0x91
	'\u2019', // 0x92
	'\u201C', // 0x93
	'\u201D', // 0x94
	'\u2022', // 0x95
	'\u2013', // 0x96
	'\u2014', // 0x97
	'\u02DC', // 0x98
	'\u2122', // 0x99
	'\u0161', // 0x9A
	'\u203A', // 0x9B
	'\u0153', // 0x9C
	'\u009D', // 0x9D
	'\u017E', // 0x9E
	'\u0178', // 0x9F (last entry)
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
|
|
52
|
|
53 // unescapeEntity reads an entity like "<" from b[src:] and writes the
|
|
54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
|
|
55 // Precondition: b[src] == '&' && dst <= src.
|
|
56 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
|
|
57 const attribute = false
|
|
58
|
|
59 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
|
|
60
|
|
61 // i starts at 1 because we already know that s[0] == '&'.
|
|
62 i, s := 1, b[src:]
|
|
63
|
|
64 if len(s) <= 1 {
|
|
65 b[dst] = b[src]
|
|
66 return dst + 1, src + 1
|
|
67 }
|
|
68
|
|
69 if s[i] == '#' {
|
|
70 if len(s) <= 3 { // We need to have at least "&#.".
|
|
71 b[dst] = b[src]
|
|
72 return dst + 1, src + 1
|
|
73 }
|
|
74 i++
|
|
75 c := s[i]
|
|
76 hex := false
|
|
77 if c == 'x' || c == 'X' {
|
|
78 hex = true
|
|
79 i++
|
|
80 }
|
|
81
|
|
82 x := '\x00'
|
|
83 for i < len(s) {
|
|
84 c = s[i]
|
|
85 i++
|
|
86 if hex {
|
|
87 if '0' <= c && c <= '9' {
|
|
88 x = 16*x + rune(c) - '0'
|
|
89 continue
|
|
90 } else if 'a' <= c && c <= 'f' {
|
|
91 x = 16*x + rune(c) - 'a' + 10
|
|
92 continue
|
|
93 } else if 'A' <= c && c <= 'F' {
|
|
94 x = 16*x + rune(c) - 'A' + 10
|
|
95 continue
|
|
96 }
|
|
97 } else if '0' <= c && c <= '9' {
|
|
98 x = 10*x + rune(c) - '0'
|
|
99 continue
|
|
100 }
|
|
101 if c != ';' {
|
|
102 i--
|
|
103 }
|
|
104 break
|
|
105 }
|
|
106
|
|
107 if i <= 3 { // No characters matched.
|
|
108 b[dst] = b[src]
|
|
109 return dst + 1, src + 1
|
|
110 }
|
|
111
|
|
112 if 0x80 <= x && x <= 0x9F {
|
|
113 // Replace characters from Windows-1252 with UTF-8 equivalents.
|
|
114 x = replacementTable[x-0x80]
|
|
115 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
|
|
116 // Replace invalid characters with the replacement character.
|
|
117 x = '\uFFFD'
|
|
118 }
|
|
119
|
|
120 return dst + utf8.EncodeRune(b[dst:], x), src + i
|
|
121 }
|
|
122
|
|
123 // Consume the maximum number of characters possible, with the
|
|
124 // consumed characters matching one of the named references.
|
|
125
|
|
126 for i < len(s) {
|
|
127 c := s[i]
|
|
128 i++
|
|
129 // Lower-cased characters are more common in entities, so we check for them first.
|
|
130 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
|
|
131 continue
|
|
132 }
|
|
133 if c != ';' {
|
|
134 i--
|
|
135 }
|
|
136 break
|
|
137 }
|
|
138
|
|
139 entityName := s[1:i]
|
|
140 if len(entityName) == 0 {
|
|
141 // No-op.
|
|
142 } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
|
|
143 // No-op.
|
|
144 } else if x := entity[string(entityName)]; x != 0 {
|
|
145 return dst + utf8.EncodeRune(b[dst:], x), src + i
|
|
146 } else if x := entity2[string(entityName)]; x[0] != 0 {
|
|
147 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
|
|
148 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
|
|
149 } else if !attribute {
|
|
150 maxLen := len(entityName) - 1
|
|
151 if maxLen > longestEntityWithoutSemicolon {
|
|
152 maxLen = longestEntityWithoutSemicolon
|
|
153 }
|
|
154 for j := maxLen; j > 1; j-- {
|
|
155 if x := entity[string(entityName[:j])]; x != 0 {
|
|
156 return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
|
|
157 }
|
|
158 }
|
|
159 }
|
|
160
|
|
161 dst1, src1 = dst+i, src+i
|
|
162 copy(b[dst:dst1], b[src:src1])
|
|
163 return dst1, src1
|
|
164 }
|
|
165
|
|
// htmlEscaper rewrites the five characters &, ', <, > and " as HTML character
// references. The numeric forms are used for the two quote characters.
var htmlEscaper = strings.NewReplacer(
	`&`, "&amp;",
	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
	`<`, "&lt;",
	`>`, "&gt;",
	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
)
|
|
173
|
|
174 // EscapeString escapes special characters like "<" to become "<". It
|
|
175 // escapes only five such characters: <, >, &, ' and ".
|
|
176 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
|
177 // always true.
|
|
178 func EscapeString(s string) string {
|
|
179 return htmlEscaper.Replace(s)
|
|
180 }
|
|
181
|
|
182 // UnescapeString unescapes entities like "<" to become "<". It unescapes a
|
|
183 // larger range of entities than EscapeString escapes. For example, "á"
|
|
184 // unescapes to "รก", as does "á" and "á".
|
|
185 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
|
|
186 // always true.
|
|
187 func UnescapeString(s string) string {
|
131
|
188 populateMapsOnce.Do(populateMaps)
|
111
|
189 i := strings.IndexByte(s, '&')
|
|
190
|
|
191 if i < 0 {
|
|
192 return s
|
|
193 }
|
|
194
|
|
195 b := []byte(s)
|
|
196 dst, src := unescapeEntity(b, i, i)
|
|
197 for len(s[src:]) > 0 {
|
|
198 if s[src] == '&' {
|
|
199 i = 0
|
|
200 } else {
|
|
201 i = strings.IndexByte(s[src:], '&')
|
|
202 }
|
|
203 if i < 0 {
|
|
204 dst += copy(b[dst:], s[src:])
|
|
205 break
|
|
206 }
|
|
207
|
|
208 if i > 0 {
|
|
209 copy(b[dst:], s[src:src+i])
|
|
210 }
|
|
211 dst, src = unescapeEntity(b, dst+i, src+i)
|
|
212 }
|
|
213 return string(b[:dst])
|
|
214 }
|