refactor(parser): paragraph, table and fence

This commit is contained in:
syy11cn 2022-05-29 10:45:57 +08:00
parent 0e3561add3
commit 7df26af941
9 changed files with 262 additions and 195 deletions

View File

@ -1,6 +1,6 @@
# @md-report/parser
Transfer plain markdown text to md-report data structure.
Transform plain markdown text into markdown tokens.
## License

View File

@ -20,6 +20,7 @@
"prepublishOnly": "nr build"
"dependencies": {
"docx": "^7.3.0",
"js-yaml": "^4.1.0"

View File

@ -1,9 +0,0 @@
// MarkdownIt token types.
// String literals matched against a token's `type` field.
export const HEADING_OPEN = 'heading_open'
export const PARAGRAPH_OPEN = 'paragraph_open'
// Style
// Font mapping with one family per script class (`ascii`, `eastAsia`).
// NOTE(review): the remaining entries and the closing brace are not visible in this view.
export const KAI_TI_FIRA_CODE_FONTS = {
ascii: 'Fira Code',
eastAsia: 'KaiTi',

View File

@ -1,128 +0,0 @@
import YAML = require('js-yaml')
import { isObject } from '@antfu/utils'
import MarkdownIt = require('markdown-it')
import type { ReportConfig, ReportMarkdown, ReportMarkdownParagraph, ReportMarkdownParagraphChild, ReportMarkdownSection } from '@md-report/types'
import Token = require('markdown-it/lib/token')
import type { IRunOptions } from 'docx'
import { HEADING_OPEN, PARAGRAPH_OPEN } from './constants'
import { getParagraphChildConfig, getParagraphChildType } from './utils'
// Module-level markdown-it instance used by `parse`; created with the `html` option switched on.
const md = MarkdownIt({ html: true })
/**
 * Split a leading `---` front-matter block off a markdown string.
 *
 * The block matched by the regular expression is replaced with an empty
 * string to produce `content`; its captured body is parsed with `YAML.load`.
 *
 * NOTE(review): the closing lines of the callback and of the function are not
 * visible in this view.
 *
 * @param code - Raw markdown, optionally starting with front matter.
 * @returns The parsed front matter (`data`) and the remaining text (`content`).
 */
export function matter(code: string): { data: ReportConfig; content: string } {
// Stays `{}` when the input has no front-matter block (the callback never runs).
let data: any = {}
const content = code.replace(/^---.*\r?\n([\s\S]*?)---/,
(_, d) => {
data = YAML.load(d)
// Anything `isObject` rejects is discarded in favour of an empty config.
if (!isObject(data))
data = {}
return ''
return { data, content }
/**
 * Convert one run of inline tokens into a paragraph child.
 *
 * NOTE(review): the loop body and the fields of the returned object are not
 * visible in this view.
 *
 * @param tokens - Inline tokens belonging to a single child.
 */
export function parseParagraphChild(tokens: Token[]): ReportMarkdownParagraphChild {
// Get rid of closing tags.
let i = 0
// Matches tokens whose type contains "close" or whose content looks like an HTML closing tag.
while (tokens[i].type.includes('close') || tokens[i].content.match(/\<\/[^]*?\>/))
// The child type comes from the token at index `i`; the run config from that token onwards.
const type = getParagraphChildType(tokens[i])
const config: IRunOptions = getParagraphChildConfig(tokens.slice(i))
return {
/**
 * Convert the tokens of one heading or paragraph into a `ReportMarkdownParagraph`.
 *
 * NOTE(review): the closing braces and the fields of the returned object are
 * not visible in this view.
 *
 * @param tokens - Tokens of the block; index 0 is read as the opening token
 *   and index 1 as the token carrying the inline `children`.
 */
export function parseParagraph(tokens: Token[]): ReportMarkdownParagraph {
const type = tokens[0].type === HEADING_OPEN ? 'heading' : 'paragraph'
// Length of the opening token's markup string.
const level = tokens[0].markup.length
const _content = tokens[1].children
const children: ReportMarkdownParagraphChild[] = []
// `start` marks the first inline token not yet assigned to a child.
let start = 0
for (let i = 0; i < _content.length; i++) {
const _token = _content[i]
// A `code_inline` or `text` token ends the current child.
if (_token.type === 'code_inline' || _token.type === 'text') {
children.push(parseParagraphChild(_content.slice(start, i + 1)))
start = i + 1
return {
/**
 * Convert the tokens of one section into a `ReportMarkdownSection`.
 *
 * NOTE(review): the closing braces, any handling of the final slice after the
 * loop, and the rest of the returned object are not visible in this view.
 *
 * @param tokens - Block-level tokens of the section.
 */
export function parseSection(tokens: Token[]): ReportMarkdownSection {
const children: ReportMarkdownParagraph[] = []
// `start` marks the first token of the block currently being collected.
let start = 0
for (let i = 0; i < tokens.length; i++) {
const token = tokens[i]
// A new heading/paragraph opening closes the previous block (`i > start` skips the very first one).
if ((token.type === HEADING_OPEN || token.type === PARAGRAPH_OPEN) && i > start) {
children.push(parseParagraph(tokens.slice(start, i)))
start = i
return {
type: 'section',
/**
 * Split a token stream into sections at each heading whose markup has length 1.
 *
 * NOTE(review): the closing braces and any handling of the final slice after
 * the loop are not visible in this view.
 *
 * @param tokens - Block-level tokens of the whole document.
 * @returns The parsed sections in document order.
 */
export function parseContent(tokens: Token[]): ReportMarkdownSection[] {
const sections: ReportMarkdownSection[] = []
// `start` marks the first token of the section currently being collected.
let start = 0
for (let i = 0; i < tokens.length; i++) {
const token = tokens[i]
// If heading 1.
if (token.type === HEADING_OPEN && token.markup.length === 1 && i > start) {
sections.push(parseSection(tokens.slice(start, i)))
start = i
return sections
/**
 * Parse a markdown string: strip front matter with `matter`, tokenize the
 * rest with the module-level markdown-it instance, then build sections with
 * `parseContent`.
 *
 * NOTE(review): only the `raw` field of the returned object is visible in this view.
 *
 * @param markdown - Full markdown source, front matter included.
 */
export function parse(markdown: string): ReportMarkdown {
const { data: frontmatter, content: rawContent } = matter(markdown)
const contentTokens = md.parse(rawContent, {})
const content = parseContent(contentTokens)
return {
raw: markdown,
// Inline markdown sample covering the inline syntaxes (bold, italic, code, link, strike, highlight, math, sup/sub, image, footnotes).
// NOTE(review): presumably a manual smoke test for `parse`; the statement that uses it is not visible in this view.
const src = `# 111
this is a paragraph with **\`strong\`**, *italic*, \`inline code\`, [hyperlink](, ~~delete~~, ==highlight==, $1 + 1 = 2$, a<sup>sup</sup><sub>sub</sub>, ![image](, footnotes[^foot][^note]
const a = 0
# Refs
[^foot]: ref1
[^note]: ref2`
// eslint-disable-next-line no-console

View File

@ -1,8 +0,0 @@
import { promises as fs } from 'fs'
import type { ReportMarkdown } from '@md-report/types'
import { parse } from './core'
/**
 * Load and parse a markdown report.
 *
 * @param filepath - File to read as UTF-8 when `content` is not supplied.
 * @param content - Markdown text to use instead of reading `filepath`.
 * @returns The result of `parse` on the markdown text.
 */
export async function load(filepath: string, content?: string): Promise<ReportMarkdown> {
// `??` only falls back to the file when `content` is null/undefined, so an empty string is parsed as-is.
const markdown = content ?? await fs.readFile(filepath, 'utf8')
return parse(markdown)

View File

@ -1,2 +1,49 @@
export * from './core'
export * from './fs'
import type Token from 'markdown-it/lib/token'
import MarkdownIt from 'markdown-it'
import type { ISectionOptions, IStylesOptions, Paragraph, Table, TableOfContents } from 'docx'
import { Document } from 'docx'
import { sliceParagraph, sliceSection } from './utils'
import { paragraphParser } from './paragraph'
// Module-level markdown-it instance (default options) used by `parse`.
const md = new MarkdownIt()
/**
 * Convert markdown text into a docx `Document`.
 *
 * @param props - `markdown` is the source text; `config.meta` is passed to
 *   markdown-it as the parse environment and `config.styles` is forwarded to
 *   `parseDocument`.
 * @returns The document built by `parseDocument`.
 */
export function parse(props: { markdown: string; config: { meta: Record<string, any>; styles: IStylesOptions } }): Document {
const { markdown, config } = props
const { meta, styles } = config
// Get frontmatter.
// NOTE(review): no front-matter handling is visible under the comment above.
// Get tokens.
const tokens: Token[] = md.parse(markdown, meta)
return parseDocument(tokens, styles)
/**
 * Build a docx `Document` from a flat markdown-it token stream.
 *
 * The stream is consumed section by section: `sliceSection` cuts the next
 * section off the remaining tokens and `parseSection` converts it.
 *
 * @param tokens - Block-level tokens of the whole document.
 * @param styles - docx style definitions forwarded to the `Document`.
 * @returns The assembled document.
 */
export function parseDocument(tokens: Token[], styles: IStylesOptions): Document {
  const sections: ISectionOptions[] = []
  let pos = 0

  // Split and parse sections.
  while (pos < tokens.length) {
    const { tokens: section, offset } = sliceSection(tokens.slice(pos))
    // A slice that consumes nothing can never make progress; stop instead of spinning.
    if (offset <= 0)
      break
    sections.push(parseSection(section))
    // `offset` is relative to the slice handed to `sliceSection`, so it is
    // added to the absolute cursor rather than assigned to it.
    pos += offset
  }

  return new Document({
    styles,
    sections,
  })
}
/**
 * Convert the tokens of one section into docx section options.
 *
 * The tokens are consumed block by block: `sliceParagraph` cuts off the next
 * block and the parser registered in `paragraphParser` for the tag of that
 * block's first token converts it.
 *
 * @param tokens - Block-level tokens of the section.
 * @returns Section options whose `children` are the converted blocks.
 */
export function parseSection(tokens: Token[]): ISectionOptions {
  const children: (Paragraph | Table | TableOfContents)[] = []
  let pos = 0

  // Split and parse paragraphs.
  while (pos < tokens.length) {
    const { tokens: paragraph, offset } = sliceParagraph(tokens.slice(pos))
    // A slice that consumes nothing can never make progress; stop instead of spinning.
    if (offset <= 0)
      break
    // Dispatch on the block just sliced, not on the first token of the whole
    // section, otherwise every block would be parsed as the section's first one.
    const parser = paragraphParser[paragraph[0]?.tag ?? '']
    // Blocks without a registered parser are skipped rather than calling `undefined`.
    if (parser)
      children.push(parser(paragraph))
    // `offset` is relative to the slice, so advance the absolute cursor by it.
    pos += offset
  }

  return {
    children,
  }
}

View File

@ -0,0 +1,94 @@
import { readFileSync } from 'fs'
import type { IImageOptions, IRunOptions, ParagraphChild } from 'docx'
import { ImageRun, Paragraph, TextRun } from 'docx'
import type Token from 'markdown-it/lib/token'
import { sliceInlineText } from './utils'
/**
 * Convert an inline token into a docx `Paragraph`.
 *
 * Only the first entry of `props.tokens` is read; its `children` are cut into
 * runs with `sliceInlineText` and converted with `parseImage` (runs starting
 * with an `img` token) or `parseText` (everything else).
 *
 * @param props - `tokens`: the inline token(s) of the block; `style`: the
 *   paragraph style id, `'normal'` when omitted.
 * @returns A paragraph holding the converted runs; it has no runs when there is
 *   no inline token or the token has no children.
 */
export function parseInline(props: { tokens: Token[]; style?: string }): Paragraph {
  const { tokens, style = 'normal' } = props
  // `children` is null on non-inline tokens, and `tokens` may be empty.
  const childrenTokens = tokens[0]?.children ?? []
  const children: ParagraphChild[] = []
  let pos = 0

  // Parse inline children.
  while (pos < childrenTokens.length) {
    // Slice the inline children, not the outer block tokens.
    const { tokens: paragraphChild, offset } = sliceInlineText(childrenTokens.slice(pos))
    if (offset <= 0 || paragraphChild.length === 0)
      break
    // Decide per run: the tag of the run's own first token selects the parser.
    if (paragraphChild[0].tag === 'img')
      children.push(parseImage(paragraphChild))
    else
      children.push(parseText(paragraphChild))
    // `offset` is relative to the slice, so advance the absolute cursor by it.
    pos += offset
  }

  return new Paragraph({
    style,
    children,
  })
}
/**
 * Convert one run of inline tokens into a docx `TextRun`.
 *
 * Opening tokens contribute formatting flags, text/code tokens contribute the
 * run's text, and closing tokens are ignored. When several tokens carry
 * text, the last one wins.
 *
 * @param tokens - Inline tokens forming a single run.
 * @returns The text run with the accumulated options.
 */
export function parseText(tokens: Token[]): TextRun {
  let options: IRunOptions = {}

  for (const token of tokens) {
    // Only deal with opening and text/code tokens.
    if (token.nesting < 0)
      continue

    // Every case ends with `break` so a tag applies only its own option.
    switch (token.tag) {
      // Bold.
      case 'strong':
        options = { ...options, bold: true }
        break
      // Italics.
      case 'em':
        options = { ...options, italics: true }
        break
      // Subscript.
      case 'sub':
        options = { ...options, subScript: true }
        break
      // Superscript.
      case 'sup':
        options = { ...options, superScript: true }
        break
      // Strikethrough.
      case 's':
        options = { ...options, strike: true }
        break
      // Highlight.
      case 'mark':
        // TODO: Replace highlight color with env data.
        options = { ...options, highlight: 'yellow' }
        break
      // Inline code.
      case 'code':
        // TODO: Replace code font with env data.
        options = { ...options, font: {}, text: token.content }
        break
      // Normal text.
      default:
        options = { ...options, text: token.content }
        break
    }
  }

  return new TextRun(options)
}
/**
 * Convert an image token into a docx `ImageRun`.
 *
 * The file named by the token's `src` attribute is read synchronously and
 * embedded as base64. When the token has no `src`, or the file cannot be
 * read, a highlighted warning `TextRun` is returned in its place.
 *
 * @param tokens - Run whose first token is the image token.
 * @returns The image run, or a warning text run when the image is unavailable.
 */
export function parseImage(tokens: Token[]): ImageRun | TextRun {
  const [image] = tokens
  // `attrGet` reads the token's own attributes, so it must be called on the
  // token; a destructured reference loses `this` and throws.
  const src = image.attrGet('src')

  const notFound = (): TextRun => new TextRun({
    text: `[MD Report]: Image ${image.content} is not found.`,
    bold: true,
    color: 'red',
    highlight: 'yellow',
  })

  if (!src)
    return notFound()

  let data: string
  try {
    data = readFileSync(src).toString('base64')
  }
  catch {
    // An unreadable or missing file gets the same visible warning as a missing `src`.
    return notFound()
  }

  const options: IImageOptions = {
    data,
    // TODO: Replace width and height with config in image url.
    transformation: {
      width: 100,
      height: 100,
    },
  }
  return new ImageRun(options)
}

View File

@ -0,0 +1,72 @@
import type Token from 'markdown-it/lib/token'
import { Paragraph, Table, TableCell, TableRow } from 'docx'
import { sliceTableRow } from './utils'
import { parseInline } from './inline'
/**
 * Convert a fenced code block into a docx `Paragraph` styled `'fence'`.
 *
 * NOTE(review): the remaining `Paragraph` options (presumably where `text` is
 * passed) and the closing lines are not visible in this view.
 *
 * @param tokens - Block tokens; only the first token's `content` is read.
 */
export function parseFence(tokens: Token[]): Paragraph {
// Variables.
const { content: text } = tokens[0]
return new Paragraph({
style: 'fence',
/**
 * Convert the tokens of one markdown table into a docx `Table`.
 *
 * The tokens are consumed row by row: `sliceTableRow` cuts off the next slice
 * and `parseTableRow` converts it.
 *
 * @param tokens - All tokens of the table block.
 * @returns A table styled `'table'` containing the converted rows.
 */
export function parseTable(tokens: Token[]): Table {
  const rows: TableRow[] = []
  let pos = 0

  while (pos < tokens.length) {
    const { tokens: tableRow, offset } = sliceTableRow(tokens.slice(pos))
    // A slice that consumes nothing can never make progress; stop instead of spinning.
    if (offset <= 0)
      break
    // Slices without any cell content (table/thead/tbody wrappers) are not rows.
    if (tableRow.some(token => token.type === 'inline'))
      rows.push(parseTableRow(tableRow))
    // `offset` is relative to the slice, so advance the absolute cursor by it.
    pos += offset
  }

  return new Table({
    style: 'table',
    rows,
  })
}
/**
 * Convert the tokens of one table row into a docx `TableRow`.
 *
 * Each `inline` token in the row is the content of one cell; it is rendered
 * with `parseInline` using the `'table'` paragraph style.
 *
 * @param tokens - Tokens of a single row.
 * @returns A row with one cell per inline token, in source order.
 */
export function parseTableRow(tokens: Token[]): TableRow {
  const cells: Token[] = tokens.filter(token => token.type === 'inline')
  const children: TableCell[] = cells.map(cell => new TableCell({
    children: [parseInline({
      tokens: [cell],
      style: 'table',
    })],
  }))

  return new TableRow({
    children,
  })
}
/**
 * Convert a paragraph block into a docx `Paragraph` with the `'normal'` style.
 *
 * NOTE(review): the closing lines of the call and the function are not visible in this view.
 *
 * @param tokens - Tokens of the paragraph block.
 */
export function parseParagraph(tokens: Token[]): Paragraph {
// Keep only the inline token(s); the open/close wrappers carry no content.
const inline = tokens.filter(token => token.type === 'inline')
return parseInline({
tokens: inline,
style: 'normal',
/**
 * Convert a heading block into a docx `Paragraph` styled `heading1` … `heading6`.
 *
 * The level is taken from the opening token's `hN` tag. The markup length is
 * used only as a fallback, because it matches the level for ATX headings
 * (`##`) but not for setext headings, whose markup is a single `=` or `-`.
 *
 * @param tokens - Tokens of the heading block, opening token first.
 * @returns The paragraph produced by `parseInline` with the heading style.
 */
export function parseHeading(tokens: Token[]): Paragraph {
  // Inline token.
  const inline = tokens.filter(token => token.type === 'inline')
  // Heading level.
  const [open] = tokens
  const tagLevel = /^h([1-6])$/.exec(open.tag)
  const level = tagLevel ? Number(tagLevel[1]) : open.markup.length

  return parseInline({
    tokens: inline,
    style: `heading${level}`,
  })
}
// Dispatch table from a block's tag to the function that converts it.
// Tags without an entry here have no parser.
// NOTE(review): the closing brace is not visible in this view.
export const paragraphParser: Record<string, (tokens: Token[]) => (Paragraph|Table)> = {
code: parseFence,
table: parseTable,
p: parseParagraph,
// Every heading level shares one parser.
h1: parseHeading,
h2: parseHeading,
h3: parseHeading,
h4: parseHeading,
h5: parseHeading,
h6: parseHeading,

View File

@ -1,55 +1,53 @@
import type { IRunOptions } from 'docx'
import type { MarkdownItTokenType } from '@md-report/types'
import Token = require('markdown-it/lib/token')
import { KAI_TI_FIRA_CODE_FONTS } from './constants'
import type Token from 'markdown-it/lib/token'
/**
 * Classify an inline token as an image or a text child.
 *
 * NOTE(review): the `default:` label and the closing braces are not visible in this view.
 *
 * @param token - Inline token to classify.
 * @returns `'image'` for `image` tokens; `'text'` is the other visible return.
 */
export function getParagraphChildType(token: Token): 'image' | 'text' {
switch (token.type) {
case 'image':
return 'image'
return 'text'
// Result shared by the `slice*` helpers.
// NOTE(review): the closing brace is not visible in this view.
export interface SliceResult {
// The tokens cut off the front of the input.
tokens: Token[]
// Count relative to the array that was passed in (not to the whole document).
offset: number
/**
 * Cut the first section off a token stream.
 *
 * A section runs from the start of `tokens` up to, but not including, the next
 * level-1 heading opening. A heading at index 0 belongs to the section it
 * opens, and content before the first H1 forms a section of its own.
 *
 * @param tokens - Remaining block-level tokens of the document.
 * @returns The section's tokens and how many input tokens they cover
 *   (`{ tokens: [], offset: 0 }` for empty input).
 */
export function sliceSection(tokens: Token[]): SliceResult {
  // Search from index 1 so an H1 at the very start opens this section instead
  // of ending it immediately.
  const nextHeading = tokens.findIndex(
    (token, index) => index > 0 && token.tag === 'h1' && token.nesting > 0,
  )
  // No further H1: the section runs to the end of the stream.
  const end = nextHeading === -1 ? tokens.length : nextHeading

  return {
    tokens: tokens.slice(0, end),
    offset: end,
  }
}
/**
 * Fold a run of inline tokens into docx run options.
 *
 * Each recognised token type merges its option(s) into `config`.
 *
 * NOTE(review): the `break` statements and closing braces are not visible in
 * this view, and the function's final `return config` appears further down,
 * after an unrelated block.
 *
 * @param tokens - Inline tokens belonging to a single run.
 */
export function getParagraphChildConfig(tokens: Token[]): IRunOptions {
let config: IRunOptions = {}
for (let i = 0; i < tokens.length; i++) {
const token = tokens[i]
switch (token.type as MarkdownItTokenType) {
case 'em_open': {
config = { ...config, italics: true }
case 'strong_open': {
config = { ...config, bold: true }
case 'mark_open': {
config = { ...config, shading: { fill: '#bbbbbb' }, style: 'mark' }
// Raw inline HTML: only the literal `<sup>` and `<sub>` opening tags are recognised.
case 'html_inline': {
if (token.content === '<sup>')
config = { ...config, superScript: true }
if (token.content === '<sub>')
config = { ...config, subScript: true }
case 's_open': {
config = { ...config, strike: true }
case 'code_inline': {
config = { ...config, font: KAI_TI_FIRA_CODE_FONTS, style: 'code' }
case 'text': {
config = { ...config, text: token.content }
/**
 * Cut the first block off a token stream.
 *
 * - A self-contained token (a `fence`, or any token with `nesting === 0` at
 *   level 0 such as a rule or a bare text token) forms a slice on its own.
 * - Otherwise the slice runs through the first closing token at level 0,
 *   i.e. the close that matches the opening token at index 0.
 * - If no such close exists the slice runs to the end of the input instead of
 *   reading past the array.
 *
 * @param tokens - Remaining tokens.
 * @returns The block's tokens and how many input tokens they cover
 *   (`{ tokens: [], offset: 0 }` for empty input).
 */
export function sliceParagraph(tokens: Token[]): SliceResult {
  const first = tokens[0]
  if (!first)
    return { tokens: [], offset: 0 }

  let last = 0
  // Code block and other self-contained tokens need no scan.
  const selfContained = first.type === 'fence' || (first.nesting === 0 && first.level === 0)
  if (!selfContained) {
    // Normal paragraphs: advance until the level-0 closing token, but never
    // beyond the final index.
    while (last < tokens.length - 1 && (tokens[last].level > 0 || tokens[last].nesting >= 0))
      last++
  }

  // Return paragraph tokens.
  return {
    tokens: tokens.slice(0, last + 1),
    offset: last + 1,
  }
}
return config
/**
 * Cut the next table row off a token stream.
 *
 * Tokens before the first `tr_open` (table/thead/tbody wrappers) are skipped;
 * the returned slice runs from `tr_open` through the following `tr_close`,
 * or to the end of the input when the row is unterminated.
 *
 * @param tokens - Remaining tokens of the table.
 * @returns The row's tokens and the number of input tokens consumed, skipped
 *   wrappers included. When no row is left, `tokens` is empty and `offset`
 *   equals the input length so the caller's loop terminates.
 */
export function sliceTableRow(tokens: Token[]): SliceResult {
  const start = tokens.findIndex(token => token.type === 'tr_open')
  if (start === -1) {
    return {
      tokens: [],
      offset: tokens.length,
    }
  }

  const close = tokens.findIndex((token, index) => index > start && token.type === 'tr_close')
  // One past the row's last token.
  const end = close === -1 ? tokens.length : close + 1

  return {
    tokens: tokens.slice(start, end),
    offset: end,
  }
}
/**
 * Cut the next run off a list of inline child tokens.
 *
 * Image (`img`) and inline-code (`code`) tokens are always a run of exactly
 * one token; anything else is delegated to `sliceParagraph`.
 *
 * @param tokens - Remaining inline child tokens.
 * @returns The run's tokens and how many input tokens they cover
 *   (`{ tokens: [], offset: 0 }` for empty input).
 */
export function sliceInlineText(tokens: Token[]): SliceResult {
  const first = tokens[0]
  // Nothing left to slice; avoid reading `.tag` of `undefined`.
  if (!first)
    return { tokens: [], offset: 0 }

  if (first.tag === 'img' || first.tag === 'code') {
    return {
      tokens: tokens.slice(0, 1),
      offset: 1,
    }
  }

  return sliceParagraph(tokens)
}