annotate libgo/go/html/escape.go @ 158:494b0b89df80 default tip

...
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Mon, 25 May 2020 18:13:55 +0900
parents 84e7813d76e9
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
111
kono
parents:
diff changeset
1 // Copyright 2010 The Go Authors. All rights reserved.
kono
parents:
diff changeset
2 // Use of this source code is governed by a BSD-style
kono
parents:
diff changeset
3 // license that can be found in the LICENSE file.
kono
parents:
diff changeset
4
kono
parents:
diff changeset
5 // Package html provides functions for escaping and unescaping HTML text.
kono
parents:
diff changeset
6 package html
kono
parents:
diff changeset
7
kono
parents:
diff changeset
8 import (
kono
parents:
diff changeset
9 "strings"
kono
parents:
diff changeset
10 "unicode/utf8"
kono
parents:
diff changeset
11 )
kono
parents:
diff changeset
12
kono
parents:
diff changeset
// replacementTable maps numeric character references in the range 0x80-0x9F
// to the runes they denote under the legacy Windows-1252 interpretation.
// It is indexed by (code point - 0x80), so it holds exactly 32 entries.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two other special cases from that table are not stored here:
// 0x00->'\uFFFD' is handled programmatically, and 0x0D->'\u000D' is a no-op.
var replacementTable = [...]rune{
	'\u20AC', '\u0081', '\u201A', '\u0192', // 0x80-0x83
	'\u201E', '\u2026', '\u2020', '\u2021', // 0x84-0x87
	'\u02C6', '\u2030', '\u0160', '\u2039', // 0x88-0x8B
	'\u0152', '\u008D', '\u017D', '\u008F', // 0x8C-0x8F
	'\u0090', '\u2018', '\u2019', '\u201C', // 0x90-0x93
	'\u201D', '\u2022', '\u2013', '\u2014', // 0x94-0x97
	'\u02DC', '\u2122', '\u0161', '\u203A', // 0x98-0x9B
	'\u0153', '\u009D', '\u017E', '\u0178', // 0x9C-0x9F
}
kono
parents:
diff changeset
52
kono
parents:
diff changeset
53 // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
kono
parents:
diff changeset
54 // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
kono
parents:
diff changeset
55 // Precondition: b[src] == '&' && dst <= src.
kono
parents:
diff changeset
56 func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
kono
parents:
diff changeset
57 const attribute = false
kono
parents:
diff changeset
58
kono
parents:
diff changeset
59 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
kono
parents:
diff changeset
60
kono
parents:
diff changeset
61 // i starts at 1 because we already know that s[0] == '&'.
kono
parents:
diff changeset
62 i, s := 1, b[src:]
kono
parents:
diff changeset
63
kono
parents:
diff changeset
64 if len(s) <= 1 {
kono
parents:
diff changeset
65 b[dst] = b[src]
kono
parents:
diff changeset
66 return dst + 1, src + 1
kono
parents:
diff changeset
67 }
kono
parents:
diff changeset
68
kono
parents:
diff changeset
69 if s[i] == '#' {
kono
parents:
diff changeset
70 if len(s) <= 3 { // We need to have at least "&#.".
kono
parents:
diff changeset
71 b[dst] = b[src]
kono
parents:
diff changeset
72 return dst + 1, src + 1
kono
parents:
diff changeset
73 }
kono
parents:
diff changeset
74 i++
kono
parents:
diff changeset
75 c := s[i]
kono
parents:
diff changeset
76 hex := false
kono
parents:
diff changeset
77 if c == 'x' || c == 'X' {
kono
parents:
diff changeset
78 hex = true
kono
parents:
diff changeset
79 i++
kono
parents:
diff changeset
80 }
kono
parents:
diff changeset
81
kono
parents:
diff changeset
82 x := '\x00'
kono
parents:
diff changeset
83 for i < len(s) {
kono
parents:
diff changeset
84 c = s[i]
kono
parents:
diff changeset
85 i++
kono
parents:
diff changeset
86 if hex {
kono
parents:
diff changeset
87 if '0' <= c && c <= '9' {
kono
parents:
diff changeset
88 x = 16*x + rune(c) - '0'
kono
parents:
diff changeset
89 continue
kono
parents:
diff changeset
90 } else if 'a' <= c && c <= 'f' {
kono
parents:
diff changeset
91 x = 16*x + rune(c) - 'a' + 10
kono
parents:
diff changeset
92 continue
kono
parents:
diff changeset
93 } else if 'A' <= c && c <= 'F' {
kono
parents:
diff changeset
94 x = 16*x + rune(c) - 'A' + 10
kono
parents:
diff changeset
95 continue
kono
parents:
diff changeset
96 }
kono
parents:
diff changeset
97 } else if '0' <= c && c <= '9' {
kono
parents:
diff changeset
98 x = 10*x + rune(c) - '0'
kono
parents:
diff changeset
99 continue
kono
parents:
diff changeset
100 }
kono
parents:
diff changeset
101 if c != ';' {
kono
parents:
diff changeset
102 i--
kono
parents:
diff changeset
103 }
kono
parents:
diff changeset
104 break
kono
parents:
diff changeset
105 }
kono
parents:
diff changeset
106
kono
parents:
diff changeset
107 if i <= 3 { // No characters matched.
kono
parents:
diff changeset
108 b[dst] = b[src]
kono
parents:
diff changeset
109 return dst + 1, src + 1
kono
parents:
diff changeset
110 }
kono
parents:
diff changeset
111
kono
parents:
diff changeset
112 if 0x80 <= x && x <= 0x9F {
kono
parents:
diff changeset
113 // Replace characters from Windows-1252 with UTF-8 equivalents.
kono
parents:
diff changeset
114 x = replacementTable[x-0x80]
kono
parents:
diff changeset
115 } else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
kono
parents:
diff changeset
116 // Replace invalid characters with the replacement character.
kono
parents:
diff changeset
117 x = '\uFFFD'
kono
parents:
diff changeset
118 }
kono
parents:
diff changeset
119
kono
parents:
diff changeset
120 return dst + utf8.EncodeRune(b[dst:], x), src + i
kono
parents:
diff changeset
121 }
kono
parents:
diff changeset
122
kono
parents:
diff changeset
123 // Consume the maximum number of characters possible, with the
kono
parents:
diff changeset
124 // consumed characters matching one of the named references.
kono
parents:
diff changeset
125
kono
parents:
diff changeset
126 for i < len(s) {
kono
parents:
diff changeset
127 c := s[i]
kono
parents:
diff changeset
128 i++
kono
parents:
diff changeset
129 // Lower-cased characters are more common in entities, so we check for them first.
kono
parents:
diff changeset
130 if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
kono
parents:
diff changeset
131 continue
kono
parents:
diff changeset
132 }
kono
parents:
diff changeset
133 if c != ';' {
kono
parents:
diff changeset
134 i--
kono
parents:
diff changeset
135 }
kono
parents:
diff changeset
136 break
kono
parents:
diff changeset
137 }
kono
parents:
diff changeset
138
kono
parents:
diff changeset
139 entityName := s[1:i]
kono
parents:
diff changeset
140 if len(entityName) == 0 {
kono
parents:
diff changeset
141 // No-op.
kono
parents:
diff changeset
142 } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
kono
parents:
diff changeset
143 // No-op.
kono
parents:
diff changeset
144 } else if x := entity[string(entityName)]; x != 0 {
kono
parents:
diff changeset
145 return dst + utf8.EncodeRune(b[dst:], x), src + i
kono
parents:
diff changeset
146 } else if x := entity2[string(entityName)]; x[0] != 0 {
kono
parents:
diff changeset
147 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
kono
parents:
diff changeset
148 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
kono
parents:
diff changeset
149 } else if !attribute {
kono
parents:
diff changeset
150 maxLen := len(entityName) - 1
kono
parents:
diff changeset
151 if maxLen > longestEntityWithoutSemicolon {
kono
parents:
diff changeset
152 maxLen = longestEntityWithoutSemicolon
kono
parents:
diff changeset
153 }
kono
parents:
diff changeset
154 for j := maxLen; j > 1; j-- {
kono
parents:
diff changeset
155 if x := entity[string(entityName[:j])]; x != 0 {
kono
parents:
diff changeset
156 return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
kono
parents:
diff changeset
157 }
kono
parents:
diff changeset
158 }
kono
parents:
diff changeset
159 }
kono
parents:
diff changeset
160
kono
parents:
diff changeset
161 dst1, src1 = dst+i, src+i
kono
parents:
diff changeset
162 copy(b[dst:dst1], b[src:src1])
kono
parents:
diff changeset
163 return dst1, src1
kono
parents:
diff changeset
164 }
kono
parents:
diff changeset
165
kono
parents:
diff changeset
// htmlEscaper rewrites the five characters that EscapeString escapes.
// The quote characters use numeric references because those are shorter
// than the named forms.
var htmlEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	"\"", "&#34;", // "&#34;" is shorter than "&quot;".
	"'", "&#39;",  // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
)
kono
parents:
diff changeset
173
kono
parents:
diff changeset
174 // EscapeString escapes special characters like "<" to become "&lt;". It
kono
parents:
diff changeset
175 // escapes only five such characters: <, >, &, ' and ".
kono
parents:
diff changeset
176 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
kono
parents:
diff changeset
177 // always true.
kono
parents:
diff changeset
178 func EscapeString(s string) string {
kono
parents:
diff changeset
179 return htmlEscaper.Replace(s)
kono
parents:
diff changeset
180 }
kono
parents:
diff changeset
181
kono
parents:
diff changeset
182 // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
kono
parents:
diff changeset
183 // larger range of entities than EscapeString escapes. For example, "&aacute;"
kono
parents:
diff changeset
184 // unescapes to "รก", as does "&#225;" and "&#xE1;".
kono
parents:
diff changeset
185 // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
kono
parents:
diff changeset
186 // always true.
kono
parents:
diff changeset
187 func UnescapeString(s string) string {
131
84e7813d76e9 gcc-8.2
mir3636
parents: 111
diff changeset
188 populateMapsOnce.Do(populateMaps)
111
kono
parents:
diff changeset
189 i := strings.IndexByte(s, '&')
kono
parents:
diff changeset
190
kono
parents:
diff changeset
191 if i < 0 {
kono
parents:
diff changeset
192 return s
kono
parents:
diff changeset
193 }
kono
parents:
diff changeset
194
kono
parents:
diff changeset
195 b := []byte(s)
kono
parents:
diff changeset
196 dst, src := unescapeEntity(b, i, i)
kono
parents:
diff changeset
197 for len(s[src:]) > 0 {
kono
parents:
diff changeset
198 if s[src] == '&' {
kono
parents:
diff changeset
199 i = 0
kono
parents:
diff changeset
200 } else {
kono
parents:
diff changeset
201 i = strings.IndexByte(s[src:], '&')
kono
parents:
diff changeset
202 }
kono
parents:
diff changeset
203 if i < 0 {
kono
parents:
diff changeset
204 dst += copy(b[dst:], s[src:])
kono
parents:
diff changeset
205 break
kono
parents:
diff changeset
206 }
kono
parents:
diff changeset
207
kono
parents:
diff changeset
208 if i > 0 {
kono
parents:
diff changeset
209 copy(b[dst:], s[src:src+i])
kono
parents:
diff changeset
210 }
kono
parents:
diff changeset
211 dst, src = unescapeEntity(b, dst+i, src+i)
kono
parents:
diff changeset
212 }
kono
parents:
diff changeset
213 return string(b[:dst])
kono
parents:
diff changeset
214 }