wip: strip xmlMode / htmlMode
This commit is contained in:
parent
5b9141cc30
commit
bc37eae8b0
|
@ -102,16 +102,6 @@ const htmlIntegrationElements = new Set([
|
||||||
])
|
])
|
||||||
|
|
||||||
export interface ParserOptions {
|
export interface ParserOptions {
|
||||||
/**
|
|
||||||
* Indicates whether special tags (`<script>`, `<style>`, and `<title>`) should get special treatment
|
|
||||||
* and if "empty" tags (eg. `<br>`) can have children. If `false`, the content of special tags
|
|
||||||
* will be text only. For feeds and other XML content (documents that don't consist of HTML),
|
|
||||||
* set this to `true`.
|
|
||||||
*
|
|
||||||
* @default false
|
|
||||||
*/
|
|
||||||
xmlMode?: boolean
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Decode entities within the document.
|
* Decode entities within the document.
|
||||||
*
|
*
|
||||||
|
@ -122,14 +112,14 @@ export interface ParserOptions {
|
||||||
/**
|
/**
|
||||||
* If set to true, all tags will be lowercased.
|
* If set to true, all tags will be lowercased.
|
||||||
*
|
*
|
||||||
* @default !xmlMode
|
* @default true
|
||||||
*/
|
*/
|
||||||
lowerCaseTags?: boolean
|
lowerCaseTags?: boolean
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* If set to `true`, all attribute names will be lowercased. This has noticeable impact on speed.
|
* If set to `true`, all attribute names will be lowercased. This has noticeable impact on speed.
|
||||||
*
|
*
|
||||||
* @default !xmlMode
|
* @default true
|
||||||
*/
|
*/
|
||||||
lowerCaseAttributeNames?: boolean
|
lowerCaseAttributeNames?: boolean
|
||||||
|
|
||||||
|
@ -137,7 +127,7 @@ export interface ParserOptions {
|
||||||
* If set to true, CDATA sections will be recognized as text even if the xmlMode option is not enabled.
|
* If set to true, CDATA sections will be recognized as text even if the xmlMode option is not enabled.
|
||||||
* NOTE: If xmlMode is set to `true` then CDATA sections will always be recognized as text.
|
* NOTE: If xmlMode is set to `true` then CDATA sections will always be recognized as text.
|
||||||
*
|
*
|
||||||
* @default xmlMode
|
* @default false
|
||||||
*/
|
*/
|
||||||
recognizeCDATA?: boolean
|
recognizeCDATA?: boolean
|
||||||
|
|
||||||
|
@ -145,7 +135,7 @@ export interface ParserOptions {
|
||||||
* If set to `true`, self-closing tags will trigger the onclosetag event even if xmlMode is not set to `true`.
|
* If set to `true`, self-closing tags will trigger the onclosetag event even if xmlMode is not set to `true`.
|
||||||
* NOTE: If xmlMode is set to `true` then self-closing tags will always be recognized.
|
* NOTE: If xmlMode is set to `true` then self-closing tags will always be recognized.
|
||||||
*
|
*
|
||||||
* @default xmlMode
|
* @default false
|
||||||
*/
|
*/
|
||||||
recognizeSelfClosing?: boolean
|
recognizeSelfClosing?: boolean
|
||||||
|
|
||||||
|
@ -218,8 +208,6 @@ export class Parser implements Callbacks {
|
||||||
private readonly lowerCaseTagNames: boolean
|
private readonly lowerCaseTagNames: boolean
|
||||||
private readonly lowerCaseAttributeNames: boolean
|
private readonly lowerCaseAttributeNames: boolean
|
||||||
private readonly recognizeSelfClosing: boolean
|
private readonly recognizeSelfClosing: boolean
|
||||||
/** We are parsing HTML. Inverse of the `xmlMode` option. */
|
|
||||||
private readonly htmlMode: boolean
|
|
||||||
private readonly tokenizer: Tokenizer
|
private readonly tokenizer: Tokenizer
|
||||||
|
|
||||||
private readonly buffers: string[] = []
|
private readonly buffers: string[] = []
|
||||||
|
@ -234,13 +222,11 @@ export class Parser implements Callbacks {
|
||||||
private readonly options: ParserOptions = {}
|
private readonly options: ParserOptions = {}
|
||||||
) {
|
) {
|
||||||
this.cbs = cbs ?? {}
|
this.cbs = cbs ?? {}
|
||||||
this.htmlMode = !this.options.xmlMode
|
this.lowerCaseTagNames = options.lowerCaseTags ?? true
|
||||||
this.lowerCaseTagNames = options.lowerCaseTags ?? this.htmlMode
|
this.lowerCaseAttributeNames = options.lowerCaseAttributeNames ?? true
|
||||||
this.lowerCaseAttributeNames =
|
this.recognizeSelfClosing = options.recognizeSelfClosing ?? false
|
||||||
options.lowerCaseAttributeNames ?? this.htmlMode
|
|
||||||
this.recognizeSelfClosing = options.recognizeSelfClosing ?? !this.htmlMode
|
|
||||||
this.tokenizer = new (options.Tokenizer ?? Tokenizer)(this.options, this)
|
this.tokenizer = new (options.Tokenizer ?? Tokenizer)(this.options, this)
|
||||||
this.foreignContext = [!this.htmlMode]
|
this.foreignContext = [false]
|
||||||
this.cbs.onparserinit?.(this)
|
this.cbs.onparserinit?.(this)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -266,7 +252,7 @@ export class Parser implements Callbacks {
|
||||||
* to specify your own additional void elements.
|
* to specify your own additional void elements.
|
||||||
*/
|
*/
|
||||||
protected isVoidElement(name: string): boolean {
|
protected isVoidElement(name: string): boolean {
|
||||||
return this.htmlMode && voidElements.has(name)
|
return voidElements.has(name)
|
||||||
}
|
}
|
||||||
|
|
||||||
/** @internal */
|
/** @internal */
|
||||||
|
@ -286,7 +272,7 @@ export class Parser implements Callbacks {
|
||||||
this.openTagStart = this.startIndex
|
this.openTagStart = this.startIndex
|
||||||
this.tagname = name
|
this.tagname = name
|
||||||
|
|
||||||
const impliesClose = this.htmlMode && openImpliesClose.get(name)
|
const impliesClose = openImpliesClose.get(name)
|
||||||
|
|
||||||
if (impliesClose) {
|
if (impliesClose) {
|
||||||
while (this.stack.length > 0 && impliesClose.has(this.stack[0])) {
|
while (this.stack.length > 0 && impliesClose.has(this.stack[0])) {
|
||||||
|
@ -297,12 +283,10 @@ export class Parser implements Callbacks {
|
||||||
if (!this.isVoidElement(name)) {
|
if (!this.isVoidElement(name)) {
|
||||||
this.stack.unshift(name)
|
this.stack.unshift(name)
|
||||||
|
|
||||||
if (this.htmlMode) {
|
if (foreignContextElements.has(name)) {
|
||||||
if (foreignContextElements.has(name)) {
|
this.foreignContext.unshift(true)
|
||||||
this.foreignContext.unshift(true)
|
} else if (htmlIntegrationElements.has(name)) {
|
||||||
} else if (htmlIntegrationElements.has(name)) {
|
this.foreignContext.unshift(false)
|
||||||
this.foreignContext.unshift(false)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.cbs.onopentagname?.(name)
|
this.cbs.onopentagname?.(name)
|
||||||
|
@ -342,10 +326,7 @@ export class Parser implements Callbacks {
|
||||||
name = name.toLowerCase()
|
name = name.toLowerCase()
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (foreignContextElements.has(name) || htmlIntegrationElements.has(name)) {
|
||||||
this.htmlMode &&
|
|
||||||
(foreignContextElements.has(name) || htmlIntegrationElements.has(name))
|
|
||||||
) {
|
|
||||||
this.foreignContext.shift()
|
this.foreignContext.shift()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -357,12 +338,12 @@ export class Parser implements Callbacks {
|
||||||
// We know the stack has sufficient elements.
|
// We know the stack has sufficient elements.
|
||||||
this.cbs.onclosetag?.(element, index !== pos)
|
this.cbs.onclosetag?.(element, index !== pos)
|
||||||
}
|
}
|
||||||
} else if (this.htmlMode && name === 'p') {
|
} else if (name === 'p') {
|
||||||
// Implicit open before close
|
// Implicit open before close
|
||||||
this.emitOpenTag('p')
|
this.emitOpenTag('p')
|
||||||
this.closeCurrentTag(true)
|
this.closeCurrentTag(true)
|
||||||
}
|
}
|
||||||
} else if (this.htmlMode && name === 'br') {
|
} else if (name === 'br') {
|
||||||
// We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
|
// We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
|
||||||
this.cbs.onopentagname?.('br')
|
this.cbs.onopentagname?.('br')
|
||||||
this.cbs.onopentag?.('br', {}, true)
|
this.cbs.onopentag?.('br', {}, true)
|
||||||
|
@ -497,7 +478,7 @@ export class Parser implements Callbacks {
|
||||||
this.endIndex = endIndex
|
this.endIndex = endIndex
|
||||||
const value = this.getSlice(start, endIndex - offset)
|
const value = this.getSlice(start, endIndex - offset)
|
||||||
|
|
||||||
if (!this.htmlMode || this.options.recognizeCDATA) {
|
if (this.options.recognizeCDATA) {
|
||||||
this.cbs.oncdatastart?.()
|
this.cbs.oncdatastart?.()
|
||||||
this.cbs.ontext?.(value)
|
this.cbs.ontext?.(value)
|
||||||
this.cbs.oncdataend?.()
|
this.cbs.oncdataend?.()
|
||||||
|
@ -537,7 +518,7 @@ export class Parser implements Callbacks {
|
||||||
this.cbs.onparserinit?.(this)
|
this.cbs.onparserinit?.(this)
|
||||||
this.buffers.length = 0
|
this.buffers.length = 0
|
||||||
this.foreignContext.length = 0
|
this.foreignContext.length = 0
|
||||||
this.foreignContext.unshift(!this.htmlMode)
|
this.foreignContext.unshift(false)
|
||||||
this.bufferOffset = 0
|
this.bufferOffset = 0
|
||||||
this.writeIndex = 0
|
this.writeIndex = 0
|
||||||
this.ended = false
|
this.ended = false
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
import {
|
import {
|
||||||
EntityDecoder,
|
EntityDecoder,
|
||||||
DecodingMode,
|
DecodingMode,
|
||||||
htmlDecodeTree,
|
htmlDecodeTree
|
||||||
xmlDecodeTree
|
|
||||||
} from 'entities/lib/decode.js'
|
} from 'entities/lib/decode.js'
|
||||||
|
|
||||||
const enum CharCodes {
|
const enum CharCodes {
|
||||||
|
@ -89,13 +88,6 @@ function isEndOfTagSection(c: number): boolean {
|
||||||
return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c)
|
return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
function isASCIIAlpha(c: number): boolean {
|
|
||||||
return (
|
|
||||||
(c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
|
|
||||||
(c >= CharCodes.UpperA && c <= CharCodes.UpperZ)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
export enum QuoteType {
|
export enum QuoteType {
|
||||||
NoValue = 0,
|
NoValue = 0,
|
||||||
Unquoted = 1,
|
Unquoted = 1,
|
||||||
|
@ -156,22 +148,16 @@ export default class Tokenizer {
|
||||||
/** The offset of the current buffer. */
|
/** The offset of the current buffer. */
|
||||||
private offset = 0
|
private offset = 0
|
||||||
|
|
||||||
private readonly xmlMode: boolean
|
|
||||||
private readonly decodeEntities: boolean
|
private readonly decodeEntities: boolean
|
||||||
private readonly entityDecoder: EntityDecoder
|
private readonly entityDecoder: EntityDecoder
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
{
|
{ decodeEntities = true }: { decodeEntities?: boolean },
|
||||||
xmlMode = false,
|
|
||||||
decodeEntities = true
|
|
||||||
}: { xmlMode?: boolean; decodeEntities?: boolean },
|
|
||||||
private readonly cbs: Callbacks
|
private readonly cbs: Callbacks
|
||||||
) {
|
) {
|
||||||
this.xmlMode = xmlMode
|
|
||||||
this.decodeEntities = decodeEntities
|
this.decodeEntities = decodeEntities
|
||||||
this.entityDecoder = new EntityDecoder(
|
this.entityDecoder = new EntityDecoder(htmlDecodeTree, (cp, consumed) =>
|
||||||
xmlMode ? xmlDecodeTree : htmlDecodeTree,
|
this.emitCodePoint(cp, consumed)
|
||||||
(cp, consumed) => this.emitCodePoint(cp, consumed)
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -358,12 +344,12 @@ export default class Tokenizer {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
|
* HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
|
||||||
*
|
|
||||||
* XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar).
|
|
||||||
* We allow anything that wouldn't end the tag.
|
|
||||||
*/
|
*/
|
||||||
private isTagStartChar(c: number) {
|
private isTagStartChar(c: number) {
|
||||||
return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c)
|
return (
|
||||||
|
(c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
|
||||||
|
(c >= CharCodes.UpperA && c <= CharCodes.UpperZ)
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
private startSpecial(sequence: Uint8Array, offset: number) {
|
private startSpecial(sequence: Uint8Array, offset: number) {
|
||||||
|
@ -383,11 +369,11 @@ export default class Tokenizer {
|
||||||
} else if (this.isTagStartChar(c)) {
|
} else if (this.isTagStartChar(c)) {
|
||||||
const lower = c | 0x20
|
const lower = c | 0x20
|
||||||
this.sectionStart = this.index
|
this.sectionStart = this.index
|
||||||
if (!this.xmlMode && lower === Sequences.TitleEnd[2]) {
|
if (lower === Sequences.TitleEnd[2]) {
|
||||||
this.startSpecial(Sequences.TitleEnd, 3)
|
this.startSpecial(Sequences.TitleEnd, 3)
|
||||||
} else {
|
} else {
|
||||||
this.state =
|
this.state =
|
||||||
!this.xmlMode && lower === Sequences.ScriptEnd[2]
|
lower === Sequences.ScriptEnd[2]
|
||||||
? State.BeforeSpecialS
|
? State.BeforeSpecialS
|
||||||
: State.InTagName
|
: State.InTagName
|
||||||
}
|
}
|
||||||
|
@ -584,9 +570,7 @@ export default class Tokenizer {
|
||||||
this.state = State.InEntity
|
this.state = State.InEntity
|
||||||
this.entityStart = this.index
|
this.entityStart = this.index
|
||||||
this.entityDecoder.startEntity(
|
this.entityDecoder.startEntity(
|
||||||
this.xmlMode
|
this.baseState === State.Text || this.baseState === State.InSpecialTag
|
||||||
? DecodingMode.Strict
|
|
||||||
: this.baseState === State.Text || this.baseState === State.InSpecialTag
|
|
||||||
? DecodingMode.Legacy
|
? DecodingMode.Legacy
|
||||||
: DecodingMode.Attribute
|
: DecodingMode.Attribute
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue