Skip to content

Commit 4b880fa

Browse files
committed
Add xml option
Although it's possible to do more extensive sniffing, and we might do so in the future, this is enough to pass the relevant web platform tests we're currently working on in jsdom.
1 parent ea7148c commit 4b880fa

File tree

3 files changed

+114
-7
lines changed

3 files changed

+114
-7
lines changed

README.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,21 @@ const htmlString = (new TextDecoder(sniffedEncoding)).decode(htmlBytes);
2121

2222
## Options
2323

24-
You can pass two potential options to `htmlEncodingSniffer`:
24+
You can pass the following options to `htmlEncodingSniffer`:
2525

2626
```js
2727
const sniffedEncoding = htmlEncodingSniffer(htmlBytes, {
28+
xml,
2829
transportLayerEncodingLabel,
29-
defaultEncoding
30+
defaultEncoding,
3031
});
3132
```
3233

33-
These represent two possible inputs into the [encoding sniffing algorithm](https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm):
34+
The `xml` option is a boolean, defaulting to `false`. If set to `true`, then we bypass the [HTML encoding sniffing algorithm](https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm) and determine the encoding from a BOM if one is present, and otherwise from the other options provided. (In the future, we may perform sniffing of the `<?xml?>` declaration, but for now that is not implemented.)
3435

35-
- `transportLayerEncodingLabel` is an encoding label that is obtained from the "transport layer" (probably a HTTP `Content-Type` header), which overrides everything but a BOM.
36-
- `defaultEncoding` is the ultimate fallback encoding used if no valid encoding is supplied by the transport layer, and no encoding is sniffed from the bytes. It defaults to `"windows-1252"`, as recommended by the algorithm's table of suggested defaults for "All other locales" (including the `en` locale).
36+
The `transportLayerEncodingLabel` is an encoding label that is obtained from the "transport layer" (probably an HTTP `Content-Type` header), which overrides everything but a BOM.
37+
38+
The `defaultEncoding` is the ultimate fallback encoding used if no valid encoding is supplied by the transport layer, and no encoding is sniffed from the bytes. For HTML, it defaults to `"windows-1252"`, as recommended by the algorithm's table of suggested defaults for "All other locales" (including the `en` locale). For XML, it defaults to `"UTF-8"`.
3739

3840
## Credits
3941

lib/html-encoding-sniffer.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@
22
const { getBOMEncoding, labelToName } = require("@exodus/bytes/encoding-lite.js");
33

44
// https://html.spec.whatwg.org/#encoding-sniffing-algorithm
5-
module.exports = (uint8Array, { transportLayerEncodingLabel, defaultEncoding = "windows-1252" } = {}) => {
5+
module.exports = (uint8Array, { xml = false, transportLayerEncodingLabel, defaultEncoding } = {}) => {
6+
if (defaultEncoding === undefined) {
7+
defaultEncoding = xml ? "UTF-8" : "windows-1252";
8+
}
9+
610
let encoding = labelToName(getBOMEncoding(uint8Array));
711

812
if (encoding === null && transportLayerEncodingLabel !== undefined) {
913
encoding = labelToName(transportLayerEncodingLabel);
1014
}
1115

12-
if (encoding === null) {
16+
if (encoding === null && !xml) {
1317
encoding = prescanMetaCharset(uint8Array);
1418
}
1519

test/tests.js

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,104 @@ for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/utf"))) {
124124
});
125125
});
126126
}
127+
128+
describe("xml: true", () => {
129+
describe("BOM detection", () => {
130+
for (const file of fs.readdirSync(path.resolve(__dirname, "fixtures/bom"))) {
131+
const buffer = read(`fixtures/bom/${file}`);
132+
const desiredEncoding = path.basename(file, ".html");
133+
134+
it(`should sniff ${file} as ${desiredEncoding}`, () => {
135+
const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true });
136+
137+
assert.strictEqual(sniffedEncoding, desiredEncoding);
138+
});
139+
140+
it(`should sniff ${file} as ${desiredEncoding}, given overriding options`, () => {
141+
const sniffedEncoding = htmlEncodingSniffer(buffer, {
142+
xml: true,
143+
transportLayerEncodingLabel: "windows-1252",
144+
defaultEncoding: "ISO-8859-1"
145+
});
146+
147+
assert.strictEqual(sniffedEncoding, desiredEncoding);
148+
});
149+
}
150+
});
151+
152+
describe("UTF-32 BOMs (not recognized, should fall back to default)", () => {
153+
it("should ignore UTF-32BE BOM and return UTF-8", () => {
154+
// UTF-32BE BOM: 00 00 FE FF
155+
const buffer = new Uint8Array([0x00, 0x00, 0xFE, 0xFF, 0x3C, 0x3F, 0x78, 0x6D, 0x6C]);
156+
const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true });
157+
158+
assert.strictEqual(sniffedEncoding, "UTF-8");
159+
});
160+
161+
it("should detect UTF-32LE BOM as UTF-16LE (since FF FE prefix matches)", () => {
162+
// UTF-32LE BOM: FF FE 00 00 — but FF FE is also UTF-16LE BOM
163+
const buffer = new Uint8Array([0xFF, 0xFE, 0x00, 0x00, 0x3C, 0x3F, 0x78, 0x6D, 0x6C]);
164+
const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true });
165+
166+
assert.strictEqual(sniffedEncoding, "UTF-16LE");
167+
});
168+
169+
it("should ignore UTF-32BE BOM and use transport layer encoding if provided", () => {
170+
const buffer = new Uint8Array([0x00, 0x00, 0xFE, 0xFF, 0x3C, 0x3F, 0x78, 0x6D, 0x6C]);
171+
const sniffedEncoding = htmlEncodingSniffer(buffer, {
172+
xml: true,
173+
transportLayerEncodingLabel: "KOI8-R"
174+
});
175+
176+
assert.strictEqual(sniffedEncoding, "KOI8-R");
177+
});
178+
});
179+
180+
describe("meta charset ignored", () => {
181+
it("should ignore meta charset and return UTF-8 default", () => {
182+
const buffer = read("fixtures/normal/charset_KOI8-R.html");
183+
const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true });
184+
185+
assert.strictEqual(sniffedEncoding, "UTF-8");
186+
});
187+
188+
it("should ignore meta charset but use transport layer encoding", () => {
189+
const buffer = read("fixtures/normal/charset_KOI8-R.html");
190+
const sniffedEncoding = htmlEncodingSniffer(buffer, {
191+
xml: true,
192+
transportLayerEncodingLabel: "ISO-8859-2"
193+
});
194+
195+
assert.strictEqual(sniffedEncoding, "ISO-8859-2");
196+
});
197+
198+
it("should ignore meta charset but use custom default encoding", () => {
199+
const buffer = read("fixtures/normal/charset_KOI8-R.html");
200+
const sniffedEncoding = htmlEncodingSniffer(buffer, {
201+
xml: true,
202+
defaultEncoding: "windows-1252"
203+
});
204+
205+
assert.strictEqual(sniffedEncoding, "windows-1252");
206+
});
207+
});
208+
209+
describe("default encoding", () => {
210+
it("should default to UTF-8 for XML", () => {
211+
const buffer = read("fixtures/no-result/no-indicators_windows-1252.html");
212+
const sniffedEncoding = htmlEncodingSniffer(buffer, { xml: true });
213+
214+
assert.strictEqual(sniffedEncoding, "UTF-8");
215+
});
216+
217+
it("should allow overriding the default encoding", () => {
218+
const buffer = read("fixtures/no-result/no-indicators_windows-1252.html");
219+
const sniffedEncoding = htmlEncodingSniffer(buffer, {
220+
xml: true,
221+
defaultEncoding: "ISO-8859-1"
222+
});
223+
224+
assert.strictEqual(sniffedEncoding, "ISO-8859-1");
225+
});
226+
});
227+
});

0 commit comments

Comments
 (0)