refactor(parser): paragraph, table and fence

This commit is contained in:
syy11cn 2022-05-29 10:45:57 +08:00
parent 0e3561add3
commit 7df26af941
9 changed files with 262 additions and 195 deletions

View File

@ -1,6 +1,6 @@
# @md-report/parser
Transfer plain markdown text to md-report data structure.
Transform plain Markdown text into Markdown tokens.
## License

View File

@ -20,6 +20,7 @@
"prepublishOnly": "nr build"
},
"dependencies": {
"docx": "^7.3.0",
"js-yaml": "^4.1.0"
}
}

View File

@ -1,9 +0,0 @@
// MarkdownIt token types.
// Token `type` that is compared against to recognise the opening of a heading block.
export const HEADING_OPEN = 'heading_open'
// Token `type` that is compared against to recognise the opening of a paragraph block.
export const PARAGRAPH_OPEN = 'paragraph_open'
// Style
// Font pair used for inline code runs: one family for ASCII glyphs, one for East Asian glyphs.
export const KAI_TI_FIRA_CODE_FONTS = {
ascii: 'Fira Code',
eastAsia: 'KaiTi',
}

View File

@ -1,128 +0,0 @@
import YAML = require('js-yaml')
import { isObject } from '@antfu/utils'
import MarkdownIt = require('markdown-it')
import type { ReportConfig, ReportMarkdown, ReportMarkdownParagraph, ReportMarkdownParagraphChild, ReportMarkdownSection } from '@md-report/types'
import Token = require('markdown-it/lib/token')
import type { IRunOptions } from 'docx'
import { HEADING_OPEN, PARAGRAPH_OPEN } from './constants'
import { getParagraphChildConfig, getParagraphChildType } from './utils'
// Module-wide markdown-it instance used by `parse`; created with the `html` option switched on.
const md = MarkdownIt({ html: true })
/**
 * Splits a leading YAML front-matter block off a markdown document.
 *
 * The block has to start on the very first line with `---` and ends at the
 * next line that begins with `---`.
 *
 * @param code Raw markdown source, optionally starting with front matter.
 * @returns `data` holds the parsed front matter (an empty object when there
 * is no block or it does not parse to an object); `content` is the source
 * with the front-matter block removed.
 */
export function matter(code: string): { data: ReportConfig; content: string } {
  let data: any = {}
  // The captured body is either empty or ends with a line break, so the
  // closing `---` can only match at the start of a line. A `---` inside a
  // YAML value therefore no longer terminates the block early. The group is
  // lazily optional (`??`) so an empty block (`---\n---`) still closes at
  // the first delimiter.
  const content = code.replace(/^---.*\r?\n((?:[\s\S]*?\r?\n)??)---/, (_, raw: string) => {
    const loaded = YAML.load(raw)
    data = isObject(loaded) ? loaded : {}
    return ''
  })
  return { data, content }
}
/**
 * Builds one paragraph child from a run of inline tokens.
 *
 * Leading closing tokens (a `type` containing `close`, or content holding a
 * closing HTML tag) are skipped before the run is inspected.
 *
 * @param tokens Inline tokens of a single run; the last token is expected to
 * be the one that carries the run's content.
 * @returns The child's type plus the run options collected from the tokens
 * that remain after skipping.
 */
export function parseParagraphChild(tokens: Token[]): ReportMarkdownParagraphChild {
  // Get rid of closing tags, but never step past the last token: if it also
  // looks like a closing tag (e.g. text that contains `</div>`), indexing
  // beyond the array would throw.
  const last = tokens.length - 1
  let i = 0
  while (i < last && (tokens[i].type.includes('close') || /<\/[^]*?>/.test(tokens[i].content)))
    i++

  const type = getParagraphChildType(tokens[i])
  const config: IRunOptions = getParagraphChildConfig(tokens.slice(i))
  return {
    type,
    config,
  }
}
/**
 * Converts the tokens of one heading or paragraph block into a report paragraph.
 *
 * @param tokens Block tokens: `tokens[0]` is the opening token and
 * `tokens[1]` the token whose `children` hold the inline content.
 * @returns The block kind (`heading` or `paragraph`), its level (length of
 * the opening token's `markup`) and the parsed inline children.
 */
export function parseParagraph(tokens: Token[]): ReportMarkdownParagraph {
  const type = tokens[0].type === HEADING_OPEN ? 'heading' : 'paragraph'
  const level = tokens[0].markup.length
  // `tokens[1]` can be absent and `children` can be null; treat both as
  // "no inline content" instead of throwing.
  const inlineTokens = tokens[1]?.children ?? []
  const children: ReportMarkdownParagraphChild[] = []

  let start = 0
  inlineTokens.forEach((token, index) => {
    // A text or inline-code token closes a run; every token since `start`
    // (formatting open/close markers) is handed over together with it.
    if (token.type === 'code_inline' || token.type === 'text') {
      children.push(parseParagraphChild(inlineTokens.slice(start, index + 1)))
      start = index + 1
    }
  })

  return {
    type,
    level,
    children,
  }
}
/**
 * Splits a section's tokens into paragraphs and parses each of them.
 *
 * A new paragraph starts at every heading-open or paragraph-open token.
 *
 * @param tokens Tokens that belong to one section.
 * @returns A `section` node whose children are the parsed paragraphs; the
 * list is empty when `tokens` is empty.
 */
export function parseSection(tokens: Token[]): ReportMarkdownSection {
  const children: ReportMarkdownParagraph[] = []

  let start = 0
  for (let i = 0; i < tokens.length; i++) {
    const { type } = tokens[i]
    if ((type === HEADING_OPEN || type === PARAGRAPH_OPEN) && i > start) {
      children.push(parseParagraph(tokens.slice(start, i)))
      start = i
    }
  }

  // Flush the tail only when there is one: with an empty token list there is
  // no opening token for `parseParagraph` to read.
  if (start < tokens.length)
    children.push(parseParagraph(tokens.slice(start)))

  return {
    type: 'section',
    children,
  }
}
/**
 * Cuts the token stream into sections and parses each one.
 *
 * A section boundary is every heading-open token whose markup is one
 * character long (a level-1 heading), except at index 0.
 *
 * @param tokens All content tokens of the document.
 * @returns One parsed section per slice, in document order.
 */
export function parseContent(tokens: Token[]): ReportMarkdownSection[] {
  // Start index of every section; the first section always starts at 0.
  const cuts: number[] = [0]
  tokens.forEach((token, index) => {
    if (index > 0 && token.type === HEADING_OPEN && token.markup.length === 1)
      cuts.push(index)
  })

  // Each section runs up to the next cut; the last one runs to the end.
  return cuts.map((from, n) => parseSection(tokens.slice(from, cuts[n + 1])))
}
/**
 * Parses a markdown document into the report structure.
 *
 * @param markdown Full markdown source, front matter included.
 * @returns The untouched source (`raw`), the extracted front matter and the
 * parsed content sections.
 */
export function parse(markdown: string): ReportMarkdown {
  const extracted = matter(markdown)
  const tokens = md.parse(extracted.content, {})

  return {
    raw: markdown,
    frontmatter: extracted.data,
    content: parseContent(tokens),
  }
}
// NOTE(review): sample document that is parsed and logged at module top
// level, i.e. every time this module is loaded. Looks like leftover
// debugging code — confirm whether it should stay.
const src = `# 111
this is a paragraph with **\`strong\`**, *italic*, \`inline code\`, [hyperlink](https://syy11.cn), ~~delete~~, ==highlight==, $1 + 1 = 2$, a<sup>sup</sup><sub>sub</sub>, ![image](https://image.cn), footnotes[^foot][^note]
\`\`\`javascript
const a = 0
\`\`\`
$$
1+2=3
$$
# Refs
[^foot]: ref1
[^note]: ref2`
// Prints the inline children of the second paragraph of the first section.
// eslint-disable-next-line no-console
console.log(parse(src).content[0].children[1].children)

View File

@ -1,8 +0,0 @@
import { promises as fs } from 'fs'
import type { ReportMarkdown } from '@md-report/types'
import { parse } from './core'
/**
 * Loads a markdown report and parses it.
 *
 * @param filepath Path read as UTF-8 when no `content` is supplied.
 * @param content Markdown source to use instead of reading `filepath`.
 * @returns The parsed report.
 */
export async function load(filepath: string, content?: string): Promise<ReportMarkdown> {
  // Explicit content wins; the file is only touched when none was passed.
  if (content != null)
    return parse(content)

  const fromDisk = await fs.readFile(filepath, 'utf8')
  return parse(fromDisk)
}

View File

@ -1,2 +1,49 @@
export * from './core'
export * from './fs'
import type Token from 'markdown-it/lib/token'
import MarkdownIt from 'markdown-it'
import type { ISectionOptions, IStylesOptions, Paragraph, Table, TableOfContents } from 'docx'
import { Document } from 'docx'
import { sliceParagraph, sliceSection } from './utils'
import { paragraphParser } from './paragraph'
// Module-wide markdown-it instance used by `parse`; constructed without any options.
const md = new MarkdownIt()
/**
 * Turns markdown source into a docx `Document`.
 *
 * @param props.markdown Markdown source to tokenize.
 * @param props.config.meta Object handed to markdown-it as the parse environment.
 * @param props.config.styles Style definitions forwarded to the document.
 * @returns The document built from the tokenized markdown.
 */
export function parse(props: { markdown: string; config: { meta: Record<string, any>; styles: IStylesOptions } }): Document {
  const { meta, styles } = props.config
  // Tokenize first, then let `parseDocument` assemble the sections.
  const tokens: Token[] = md.parse(props.markdown, meta)
  return parseDocument(tokens, styles)
}
/**
 * Builds a docx `Document` from a flat markdown-it token list.
 *
 * The list is consumed front to back: each step slices one section off the
 * remaining tokens and parses it.
 *
 * @param tokens Block tokens of the whole document.
 * @param styles Style definitions passed through to the document.
 * @returns A document holding one docx section per sliced token group.
 */
export function parseDocument(tokens: Token[], styles: IStylesOptions): Document {
  const sections: ISectionOptions[] = []
  let pos = 0

  // Split and parse sections.
  while (pos < tokens.length) {
    const { tokens: section, offset } = sliceSection(tokens.slice(pos))
    sections.push(parseSection(section))
    // `offset` counts from the start of the slice that was passed in, so it
    // has to be added to the absolute position rather than replace it.
    pos += offset
  }

  return new Document({
    styles,
    sections,
  })
}
/**
 * Converts the tokens of one section into docx section options.
 *
 * The tokens are consumed block by block; each block is dispatched to the
 * parser registered in `paragraphParser` for the tag of its first token.
 *
 * @param tokens Block tokens that belong to one section.
 * @returns Section options whose `children` are the parsed blocks. Blocks
 * whose tag has no registered parser are left out.
 */
export function parseSection(tokens: Token[]): ISectionOptions {
  const children: (Paragraph | Table | TableOfContents)[] = []
  let pos = 0

  // Split and parse paragraphs.
  while (pos < tokens.length) {
    const { tokens: paragraph, offset } = sliceParagraph(tokens.slice(pos))
    // Dispatch on the block that was just sliced, not on the section's first
    // token, otherwise every block would be parsed like the first one.
    const parser = paragraphParser[paragraph[0].tag]
    // Tags without a parser (lists, quotes, rules, ...) are skipped instead
    // of calling `undefined`.
    if (parser)
      children.push(parser(paragraph))
    // `offset` is relative to the slice, so advance by it.
    pos += offset
  }

  return {
    children,
  }
}

View File

@ -0,0 +1,94 @@
import { readFileSync } from 'fs'
import type { IImageOptions, IRunOptions, ParagraphChild } from 'docx'
import { ImageRun, Paragraph, TextRun } from 'docx'
import type Token from 'markdown-it/lib/token'
import { sliceInlineText } from './utils'
/**
 * Builds a docx `Paragraph` from an inline token.
 *
 * @param props.tokens Token list whose first entry is the inline token; its
 * `children` are the pieces that get converted into runs.
 * @param props.style Paragraph style id, `normal` when omitted.
 * @returns A paragraph with one run per sliced group of inline children; it
 * has no runs when there is no inline token or the token has no children.
 */
export function parseInline(props: { tokens: Token[]; style?: string }): Paragraph {
  const { tokens, style = 'normal' } = props
  // A missing inline token and `children: null` both mean "nothing to render".
  const inlineTokens = tokens[0]?.children ?? []
  const children: ParagraphChild[] = []
  let pos = 0

  // Parse inline children: slice and inspect the children themselves, not
  // the outer token list.
  while (pos < inlineTokens.length) {
    const { tokens: run, offset } = sliceInlineText(inlineTokens.slice(pos))
    if (run[0].tag === 'img')
      children.push(parseImage(run))
    else
      children.push(parseText(run))
    // `offset` is relative to the slice, so advance by it.
    pos += offset
  }

  return new Paragraph({
    style,
    children,
  })
}
/**
 * Folds a group of inline tokens into a single docx `TextRun`.
 *
 * Opening and self-contained tokens each contribute run options according
 * to their tag; closing tokens contribute nothing. Later tokens override
 * options set by earlier ones.
 *
 * @param tokens Inline tokens that make up one run.
 * @returns A text run created from the accumulated options.
 */
export function parseText(tokens: Token[]): TextRun {
  // Options switched on by pure formatting tags.
  const formatByTag = new Map<string, IRunOptions>([
    ['strong', { bold: true }],
    ['em', { italics: true }],
    ['sub', { subScript: true }],
    ['sup', { superScript: true }],
    ['s', { strike: true }],
    // TODO: Replace highlight color with env data.
    ['mark', { highlight: 'yellow' }],
  ])

  let options: IRunOptions = {}
  for (const token of tokens) {
    // Closing tokens are ignored.
    if (token.nesting < 0)
      continue

    const format = formatByTag.get(token.tag)
    if (format) {
      options = { ...options, ...format }
    }
    else if (token.tag === 'code') {
      // Inline code carries its own text.
      // TODO: Replace code font with env data.
      options = { ...options, font: {}, text: token.content }
    }
    else {
      // Any other tag: take the token's content as the run text.
      options = { ...options, text: token.content }
    }
  }

  return new TextRun(options)
}
/**
 * Converts an image token into a docx `ImageRun`.
 *
 * @param tokens Token list whose first entry is the image token; its `src`
 * attribute is read as a local file path.
 * @returns The image run, or a highlighted warning `TextRun` when the token
 * has no `src` or the file cannot be read.
 */
export function parseImage(tokens: Token[]): ImageRun | TextRun {
  const image = tokens[0]
  // Call `attrGet` on the token: destructuring the method off the token
  // detaches it from its `this` and the call would throw.
  const src = image.attrGet('src')

  // Visible placeholder rendered in place of an image that cannot be embedded.
  const missing = (): TextRun => new TextRun({
    text: `[MD Report]: Image ${image.content} is not found.`,
    bold: true,
    color: 'red',
    highlight: 'yellow',
  })

  if (!src)
    return missing()

  let data: string
  try {
    data = readFileSync(src).toString('base64')
  }
  catch {
    // Unreadable path (missing file, remote URL, ...): keep building the
    // document and show the placeholder instead of aborting.
    return missing()
  }

  const options: IImageOptions = {
    data,
    // TODO: Replace width and height with config in image url.
    transformation: {
      width: 100,
      height: 100,
    },
  }
  return new ImageRun(options)
}

View File

@ -0,0 +1,72 @@
import type Token from 'markdown-it/lib/token'
import { Paragraph, Table, TableCell, TableRow } from 'docx'
import { sliceTableRow } from './utils'
import { parseInline } from './inline'
/**
 * Renders a fenced code block as a single paragraph.
 *
 * @param tokens Token list whose first entry holds the code in `content`.
 * @returns A paragraph in the `fence` style containing that content.
 */
export function parseFence(tokens: Token[]): Paragraph {
  const [fence] = tokens
  return new Paragraph({ style: 'fence', text: fence.content })
}
/**
 * Converts the tokens of a markdown table into a docx `Table`.
 *
 * Rows are delimited by scanning for `tr_open` / `tr_close` pairs directly,
 * so every index stays absolute within `tokens` and the scan always
 * terminates at the end of the list.
 *
 * @param tokens All tokens of one table, wrapper tokens included.
 * @returns A table in the `table` style with one row per `tr_open` /
 * `tr_close` pair, header rows included.
 */
export function parseTable(tokens: Token[]): Table {
  const rows: TableRow[] = []
  // Index of the currently open row, -1 while between rows.
  let rowStart = -1

  tokens.forEach((token, index) => {
    if (token.type === 'tr_open') {
      rowStart = index
    }
    else if (token.type === 'tr_close' && rowStart !== -1) {
      rows.push(parseTableRow(tokens.slice(rowStart, index + 1)))
      rowStart = -1
    }
  })

  return new Table({
    style: 'table',
    rows,
  })
}
/**
 * Converts the tokens of one table row into a docx `TableRow`.
 *
 * @param tokens Tokens of a single row.
 * @returns A row with one cell per `inline` token, each cell holding that
 * token parsed as a paragraph in the `table` style.
 */
export function parseTableRow(tokens: Token[]): TableRow {
  const children: TableCell[] = []

  for (const token of tokens) {
    // Only inline tokens carry cell content; everything else is structure.
    if (token.type !== 'inline')
      continue
    const paragraph = parseInline({ tokens: [token], style: 'table' })
    children.push(new TableCell({ children: [paragraph] }))
  }

  return new TableRow({ children })
}
/**
 * Converts the tokens of a plain paragraph into a docx `Paragraph`.
 *
 * @param tokens Tokens of one paragraph block.
 * @returns The block's inline tokens parsed in the `normal` style.
 */
export function parseParagraph(tokens: Token[]): Paragraph {
  return parseInline({
    style: 'normal',
    tokens: tokens.filter(({ type }) => type === 'inline'),
  })
}
/**
 * Converts the tokens of a heading into a docx `Paragraph`.
 *
 * @param tokens Tokens of one heading block; the first one is the opening
 * token.
 * @returns The heading's inline tokens parsed in the style `heading<level>`.
 */
export function parseHeading(tokens: Token[]): Paragraph {
  // Inline token.
  const inline = tokens.filter(token => token.type === 'inline')
  // Heading level. Read it from the `hN` tag: for underlined (setext)
  // headings `markup` is the single underline character, so its length is 1
  // for both levels. The markup length stays as fallback for other tags.
  const opening = tokens[0]
  const match = /^h([1-6])$/.exec(opening.tag)
  const level = match ? Number(match[1]) : opening.markup.length

  return parseInline({
    tokens: inline,
    style: `heading${level}`,
  })
}
// Signature shared by all block-level parsers.
type BlockParser = (tokens: Token[]) => (Paragraph | Table)

// Every heading level is handled by the same parser.
const headingParsers: Record<string, BlockParser> = {
  h1: parseHeading,
  h2: parseHeading,
  h3: parseHeading,
  h4: parseHeading,
  h5: parseHeading,
  h6: parseHeading,
}

/** Block parsers keyed by the tag of a block's first token. */
export const paragraphParser: Record<string, BlockParser> = {
  code: parseFence,
  table: parseTable,
  p: parseParagraph,
  ...headingParsers,
}

View File

@ -1,55 +1,53 @@
import type { IRunOptions } from 'docx'
import type { MarkdownItTokenType } from '@md-report/types'
import Token = require('markdown-it/lib/token')
import { KAI_TI_FIRA_CODE_FONTS } from './constants'
import type Token from 'markdown-it/lib/token'
export function getParagraphChildType(token: Token): 'image' | 'text' {
switch (token.type) {
case 'image':
return 'image'
default:
return 'text'
/** What the `slice*` helpers return: the tokens cut off the front of a list plus a position value. */
export interface SliceResult {
// Tokens that make up the sliced unit.
tokens: Token[]
// Position value returned with the slice; callers use it to move their scan position forward.
offset: number
}
/**
 * Cuts the first section off a block token list.
 *
 * A section runs from the first token up to, but not including, the next
 * token that opens an `h1`. Content before the first `h1` therefore forms a
 * section of its own.
 *
 * @param tokens Remaining block tokens of the document.
 * @returns The section's tokens and their count as `offset` (relative to
 * `tokens`); both are empty/zero for an empty list.
 */
export function sliceSection(tokens: Token[]): SliceResult {
  // The first token always belongs to the section, even if it opens an h1.
  let offset = tokens.length === 0 ? 0 : 1
  while (offset < tokens.length && !(tokens[offset].tag === 'h1' && tokens[offset].nesting > 0))
    offset++

  return {
    tokens: tokens.slice(0, offset),
    offset,
  }
}
export function getParagraphChildConfig(tokens: Token[]): IRunOptions {
let config: IRunOptions = {}
/**
 * Cuts the first block off a block token list.
 *
 * A block that starts with a self-contained token (`nesting` 0, e.g. a
 * fence) is that single token. Otherwise the block extends to the first
 * closing token at level 0, or to the end of the list if there is none.
 *
 * @param tokens Remaining block tokens.
 * @returns The block's tokens and their count as `offset` (relative to
 * `tokens`); both are empty/zero for an empty list.
 */
export function sliceParagraph(tokens: Token[]): SliceResult {
  if (tokens.length === 0)
    return { tokens: [], offset: 0 }

  let offset = 0
  // Self-contained tokens have no closing partner to look for; scanning on
  // would swallow the following block.
  if (tokens[0].nesting !== 0) {
    const last = tokens.length - 1
    // Stop at the last token at the latest so an unterminated block cannot
    // run past the end of the list.
    while (offset < last && (tokens[offset].level > 0 || tokens[offset].nesting >= 0))
      offset++
  }

  // Return paragraph tokens.
  return {
    tokens: tokens.slice(0, offset + 1),
    offset: offset + 1,
  }
}
for (let i = 0; i < tokens.length; i++) {
const token = tokens[i]
switch (token.type as MarkdownItTokenType) {
case 'em_open': {
config = { ...config, italics: true }
break
}
case 'strong_open': {
config = { ...config, bold: true }
break
}
case 'mark_open': {
config = { ...config, shading: { fill: '#bbbbbb' }, style: 'mark' }
break
}
case 'html_inline': {
if (token.content === '<sup>')
config = { ...config, superScript: true }
if (token.content === '<sub>')
config = { ...config, subScript: true }
break
}
case 's_open': {
config = { ...config, strike: true }
break
}
case 'code_inline': {
config = { ...config, font: KAI_TI_FIRA_CODE_FONTS, style: 'code' }
break
}
case 'text': {
config = { ...config, text: token.content }
}
/**
 * Cuts the first table row off a token list.
 *
 * Tokens in front of the first `tr_open` are skipped. The row runs from that
 * `tr_open` through the next `tr_close`, or to the end of the list when the
 * row is not closed.
 *
 * @param tokens Remaining tokens of a table.
 * @returns The row's tokens and, as `offset`, the index just past the row
 * (relative to `tokens`). When no row is left, `tokens` is empty and
 * `offset` is the length of the input.
 */
export function sliceTableRow(tokens: Token[]): SliceResult {
  const start = tokens.findIndex(token => token.type === 'tr_open')
  // No further row: report everything as consumed so a scanning caller stops.
  if (start === -1)
    return { tokens: [], offset: tokens.length }

  const close = tokens.findIndex((token, index) => index > start && token.type === 'tr_close')
  const end = close === -1 ? tokens.length : close + 1

  return {
    tokens: tokens.slice(start, end),
    offset: end,
  }
}
/**
 * Cuts the first run off a list of inline tokens.
 *
 * A token that is not an opening token (image, inline code, plain text, ...)
 * forms a run on its own. An opening token forms a run together with
 * everything up to its matching closing token, i.e. the next closing token
 * on the same level; an unclosed run extends to the end of the list.
 *
 * @param tokens Remaining inline tokens.
 * @returns The run's tokens and their count as `offset` (relative to
 * `tokens`); both are empty/zero for an empty list.
 */
export function sliceInlineText(tokens: Token[]): SliceResult {
  if (tokens.length === 0)
    return { tokens: [], offset: 0 }

  const first = tokens[0]
  // Images, inline code and plain text stand alone.
  if (first.nesting <= 0) {
    return {
      tokens: tokens.slice(0, 1),
      offset: 1,
    }
  }

  // Look for the closing token that pairs with `first`; nested pairs sit on
  // deeper levels and are stepped over.
  let end = 1
  while (end < tokens.length && !(tokens[end].nesting < 0 && tokens[end].level === first.level))
    end++

  // Include the closing token when one was found, never exceed the list.
  const count = Math.min(end + 1, tokens.length)
  return {
    tokens: tokens.slice(0, count),
    offset: count,
  }
}