wip: strip xmlMode / htmlMode

This commit is contained in:
Evan You 2023-11-12 17:01:05 +08:00
parent 5b9141cc30
commit bc37eae8b0
2 changed files with 30 additions and 65 deletions

View File

@ -102,16 +102,6 @@ const htmlIntegrationElements = new Set([
]) ])
export interface ParserOptions { export interface ParserOptions {
/**
* Indicates whether special tags (`<script>`, `<style>`, and `<title>`) should get special treatment
* and if "empty" tags (eg. `<br>`) can have children. If `false`, the content of special tags
* will be text only. For feeds and other XML content (documents that don't consist of HTML),
* set this to `true`.
*
* @default false
*/
xmlMode?: boolean
/** /**
* Decode entities within the document. * Decode entities within the document.
* *
@ -122,14 +112,14 @@ export interface ParserOptions {
/** /**
* If set to true, all tags will be lowercased. * If set to true, all tags will be lowercased.
* *
* @default !xmlMode * @default true
*/ */
lowerCaseTags?: boolean lowerCaseTags?: boolean
/** /**
* If set to `true`, all attribute names will be lowercased. This has noticeable impact on speed. * If set to `true`, all attribute names will be lowercased. This has noticeable impact on speed.
* *
* @default !xmlMode * @default true
*/ */
lowerCaseAttributeNames?: boolean lowerCaseAttributeNames?: boolean
@ -137,7 +127,7 @@ export interface ParserOptions {
* If set to true, CDATA sections will be recognized as text even if the xmlMode option is not enabled. * If set to true, CDATA sections will be recognized as text even if the xmlMode option is not enabled.
* NOTE: If xmlMode is set to `true` then CDATA sections will always be recognized as text. * NOTE: If xmlMode is set to `true` then CDATA sections will always be recognized as text.
* *
* @default xmlMode * @default false
*/ */
recognizeCDATA?: boolean recognizeCDATA?: boolean
@ -145,7 +135,7 @@ export interface ParserOptions {
* If set to `true`, self-closing tags will trigger the onclosetag event even if xmlMode is not set to `true`. * If set to `true`, self-closing tags will trigger the onclosetag event even if xmlMode is not set to `true`.
* NOTE: If xmlMode is set to `true` then self-closing tags will always be recognized. * NOTE: If xmlMode is set to `true` then self-closing tags will always be recognized.
* *
* @default xmlMode * @default false
*/ */
recognizeSelfClosing?: boolean recognizeSelfClosing?: boolean
@ -218,8 +208,6 @@ export class Parser implements Callbacks {
private readonly lowerCaseTagNames: boolean private readonly lowerCaseTagNames: boolean
private readonly lowerCaseAttributeNames: boolean private readonly lowerCaseAttributeNames: boolean
private readonly recognizeSelfClosing: boolean private readonly recognizeSelfClosing: boolean
/** We are parsing HTML. Inverse of the `xmlMode` option. */
private readonly htmlMode: boolean
private readonly tokenizer: Tokenizer private readonly tokenizer: Tokenizer
private readonly buffers: string[] = [] private readonly buffers: string[] = []
@ -234,13 +222,11 @@ export class Parser implements Callbacks {
private readonly options: ParserOptions = {} private readonly options: ParserOptions = {}
) { ) {
this.cbs = cbs ?? {} this.cbs = cbs ?? {}
this.htmlMode = !this.options.xmlMode this.lowerCaseTagNames = options.lowerCaseTags ?? true
this.lowerCaseTagNames = options.lowerCaseTags ?? this.htmlMode this.lowerCaseAttributeNames = options.lowerCaseAttributeNames ?? true
this.lowerCaseAttributeNames = this.recognizeSelfClosing = options.recognizeSelfClosing ?? false
options.lowerCaseAttributeNames ?? this.htmlMode
this.recognizeSelfClosing = options.recognizeSelfClosing ?? !this.htmlMode
this.tokenizer = new (options.Tokenizer ?? Tokenizer)(this.options, this) this.tokenizer = new (options.Tokenizer ?? Tokenizer)(this.options, this)
this.foreignContext = [!this.htmlMode] this.foreignContext = [false]
this.cbs.onparserinit?.(this) this.cbs.onparserinit?.(this)
} }
@ -266,7 +252,7 @@ export class Parser implements Callbacks {
* to specify your own additional void elements. * to specify your own additional void elements.
*/ */
protected isVoidElement(name: string): boolean { protected isVoidElement(name: string): boolean {
return this.htmlMode && voidElements.has(name) return voidElements.has(name)
} }
/** @internal */ /** @internal */
@ -286,7 +272,7 @@ export class Parser implements Callbacks {
this.openTagStart = this.startIndex this.openTagStart = this.startIndex
this.tagname = name this.tagname = name
const impliesClose = this.htmlMode && openImpliesClose.get(name) const impliesClose = openImpliesClose.get(name)
if (impliesClose) { if (impliesClose) {
while (this.stack.length > 0 && impliesClose.has(this.stack[0])) { while (this.stack.length > 0 && impliesClose.has(this.stack[0])) {
@ -297,12 +283,10 @@ export class Parser implements Callbacks {
if (!this.isVoidElement(name)) { if (!this.isVoidElement(name)) {
this.stack.unshift(name) this.stack.unshift(name)
if (this.htmlMode) { if (foreignContextElements.has(name)) {
if (foreignContextElements.has(name)) { this.foreignContext.unshift(true)
this.foreignContext.unshift(true) } else if (htmlIntegrationElements.has(name)) {
} else if (htmlIntegrationElements.has(name)) { this.foreignContext.unshift(false)
this.foreignContext.unshift(false)
}
} }
} }
this.cbs.onopentagname?.(name) this.cbs.onopentagname?.(name)
@ -342,10 +326,7 @@ export class Parser implements Callbacks {
name = name.toLowerCase() name = name.toLowerCase()
} }
if ( if (foreignContextElements.has(name) || htmlIntegrationElements.has(name)) {
this.htmlMode &&
(foreignContextElements.has(name) || htmlIntegrationElements.has(name))
) {
this.foreignContext.shift() this.foreignContext.shift()
} }
@ -357,12 +338,12 @@ export class Parser implements Callbacks {
// We know the stack has sufficient elements. // We know the stack has sufficient elements.
this.cbs.onclosetag?.(element, index !== pos) this.cbs.onclosetag?.(element, index !== pos)
} }
} else if (this.htmlMode && name === 'p') { } else if (name === 'p') {
// Implicit open before close // Implicit open before close
this.emitOpenTag('p') this.emitOpenTag('p')
this.closeCurrentTag(true) this.closeCurrentTag(true)
} }
} else if (this.htmlMode && name === 'br') { } else if (name === 'br') {
// We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed. // We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
this.cbs.onopentagname?.('br') this.cbs.onopentagname?.('br')
this.cbs.onopentag?.('br', {}, true) this.cbs.onopentag?.('br', {}, true)
@ -497,7 +478,7 @@ export class Parser implements Callbacks {
this.endIndex = endIndex this.endIndex = endIndex
const value = this.getSlice(start, endIndex - offset) const value = this.getSlice(start, endIndex - offset)
if (!this.htmlMode || this.options.recognizeCDATA) { if (this.options.recognizeCDATA) {
this.cbs.oncdatastart?.() this.cbs.oncdatastart?.()
this.cbs.ontext?.(value) this.cbs.ontext?.(value)
this.cbs.oncdataend?.() this.cbs.oncdataend?.()
@ -537,7 +518,7 @@ export class Parser implements Callbacks {
this.cbs.onparserinit?.(this) this.cbs.onparserinit?.(this)
this.buffers.length = 0 this.buffers.length = 0
this.foreignContext.length = 0 this.foreignContext.length = 0
this.foreignContext.unshift(!this.htmlMode) this.foreignContext.unshift(false)
this.bufferOffset = 0 this.bufferOffset = 0
this.writeIndex = 0 this.writeIndex = 0
this.ended = false this.ended = false

View File

@ -1,8 +1,7 @@
import { import {
EntityDecoder, EntityDecoder,
DecodingMode, DecodingMode,
htmlDecodeTree, htmlDecodeTree
xmlDecodeTree
} from 'entities/lib/decode.js' } from 'entities/lib/decode.js'
const enum CharCodes { const enum CharCodes {
@ -89,13 +88,6 @@ function isEndOfTagSection(c: number): boolean {
return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c) return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c)
} }
function isASCIIAlpha(c: number): boolean {
return (
(c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
(c >= CharCodes.UpperA && c <= CharCodes.UpperZ)
)
}
export enum QuoteType { export enum QuoteType {
NoValue = 0, NoValue = 0,
Unquoted = 1, Unquoted = 1,
@ -156,22 +148,16 @@ export default class Tokenizer {
/** The offset of the current buffer. */ /** The offset of the current buffer. */
private offset = 0 private offset = 0
private readonly xmlMode: boolean
private readonly decodeEntities: boolean private readonly decodeEntities: boolean
private readonly entityDecoder: EntityDecoder private readonly entityDecoder: EntityDecoder
constructor( constructor(
{ { decodeEntities = true }: { decodeEntities?: boolean },
xmlMode = false,
decodeEntities = true
}: { xmlMode?: boolean; decodeEntities?: boolean },
private readonly cbs: Callbacks private readonly cbs: Callbacks
) { ) {
this.xmlMode = xmlMode
this.decodeEntities = decodeEntities this.decodeEntities = decodeEntities
this.entityDecoder = new EntityDecoder( this.entityDecoder = new EntityDecoder(htmlDecodeTree, (cp, consumed) =>
xmlMode ? xmlDecodeTree : htmlDecodeTree, this.emitCodePoint(cp, consumed)
(cp, consumed) => this.emitCodePoint(cp, consumed)
) )
} }
@ -358,12 +344,12 @@ export default class Tokenizer {
/** /**
* HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name. * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
*
* XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar).
* We allow anything that wouldn't end the tag.
*/ */
private isTagStartChar(c: number) { private isTagStartChar(c: number) {
return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c) return (
(c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
(c >= CharCodes.UpperA && c <= CharCodes.UpperZ)
)
} }
private startSpecial(sequence: Uint8Array, offset: number) { private startSpecial(sequence: Uint8Array, offset: number) {
@ -383,11 +369,11 @@ export default class Tokenizer {
} else if (this.isTagStartChar(c)) { } else if (this.isTagStartChar(c)) {
const lower = c | 0x20 const lower = c | 0x20
this.sectionStart = this.index this.sectionStart = this.index
if (!this.xmlMode && lower === Sequences.TitleEnd[2]) { if (lower === Sequences.TitleEnd[2]) {
this.startSpecial(Sequences.TitleEnd, 3) this.startSpecial(Sequences.TitleEnd, 3)
} else { } else {
this.state = this.state =
!this.xmlMode && lower === Sequences.ScriptEnd[2] lower === Sequences.ScriptEnd[2]
? State.BeforeSpecialS ? State.BeforeSpecialS
: State.InTagName : State.InTagName
} }
@ -584,9 +570,7 @@ export default class Tokenizer {
this.state = State.InEntity this.state = State.InEntity
this.entityStart = this.index this.entityStart = this.index
this.entityDecoder.startEntity( this.entityDecoder.startEntity(
this.xmlMode this.baseState === State.Text || this.baseState === State.InSpecialTag
? DecodingMode.Strict
: this.baseState === State.Text || this.baseState === State.InSpecialTag
? DecodingMode.Legacy ? DecodingMode.Legacy
: DecodingMode.Attribute : DecodingMode.Attribute
) )