refactor(parser): paragraph, table and fence

2022-05-29 10:45:57 +08:00 · 2022-05-29 10:45:57 +08:00 · 7df26af941
parent 0e3561add3
commit 7df26af941
9 changed files with 262 additions and 195 deletions
--- a/packages/parser/README.md
+++ b/packages/parser/README.md
@ -1,6 +1,6 @@
 # @md-report/parser
-Transfer plain markdown text to md-report data structure.
+Transform plain Markdown text into Markdown tokens.
 ## License
--- a/packages/parser/package.json
+++ b/packages/parser/package.json
@ -20,6 +20,7 @@
    "prepublishOnly": "nr build"
  },
  "dependencies": {
    "docx": "^7.3.0",
    "js-yaml": "^4.1.0"
  }
 }
--- a/packages/parser/src/constants.ts
+++ b/packages/parser/src/constants.ts
@ -1,9 +0,0 @@
 // MarkdownIt token types.
 // String values compared against `token.type` to detect where a heading or a paragraph opens.
 export const HEADING_OPEN = 'heading_open'
 export const PARAGRAPH_OPEN = 'paragraph_open'
 // Style
 // Font attributes keyed by `ascii` and `eastAsia`.
 // NOTE(review): presumably consumed as a docx run `font` option for inline code — confirm against the caller.
 export const KAI_TI_FIRA_CODE_FONTS = {
  ascii: 'Fira Code',
  eastAsia: 'KaiTi',
 }
--- a/packages/parser/src/core.ts
+++ b/packages/parser/src/core.ts
@ -1,128 +0,0 @@
 import YAML = require('js-yaml')
 import { isObject } from '@antfu/utils'
 import MarkdownIt = require('markdown-it')
 import type { ReportConfig, ReportMarkdown, ReportMarkdownParagraph, ReportMarkdownParagraphChild, ReportMarkdownSection } from '@md-report/types'
 import Token = require('markdown-it/lib/token')
 import type { IRunOptions } from 'docx'
 import { HEADING_OPEN, PARAGRAPH_OPEN } from './constants'
 import { getParagraphChildConfig, getParagraphChildType } from './utils'
 const md = MarkdownIt({ html: true })
 /**
  * Separate a leading `---` frontmatter block from the rest of a markdown document.
  *
  * @param code - Raw markdown source.
  * @returns `data`: the frontmatter loaded through `YAML.load` (an empty object when
  * there is no frontmatter or it does not load to an object); `content`: the source
  * with the frontmatter block removed.
  */
 export function matter(code: string): { data: ReportConfig; content: string } {
  const frontmatterPattern = /^---.*\r?\n([\s\S]*?)---/
  const found = frontmatterPattern.exec(code)
  let data: any = {}
  // No frontmatter: hand the source back untouched.
  if (found === null)
    return { data, content: code }
  const loaded: any = YAML.load(found[1])
  if (isObject(loaded))
    data = loaded
  // The pattern is anchored at the start, so dropping the match is a plain slice.
  return { data, content: code.slice(found[0].length) }
 }
 /**
  * Build one paragraph child from a run of inline tokens.
  *
  * Leading tokens whose type contains `close`, or whose content matches a closing
  * HTML tag, are skipped; the remaining tokens determine the child's type and run config.
  *
  * @param tokens - Inline tokens that make up a single child.
  * @returns The child's `type` and docx run `config`.
  */
 export function parseParagraphChild(tokens: Token[]): ReportMarkdownParagraphChild {
  const closingHtmlTag = /\<\/[^]*?\>/
  // Advance to the first token that is not a closing tag.
  let first = 0
  for (;;) {
    const { type: tokenType, content } = tokens[first]
    if (!tokenType.includes('close') && !closingHtmlTag.test(content))
      break
    first++
  }
  const type = getParagraphChildType(tokens[first])
  const config: IRunOptions = getParagraphChildConfig(tokens.slice(first))
  return { type, config }
 }
 /**
  * Convert the tokens of one heading or paragraph into a report paragraph.
  *
  * @param tokens - Tokens of a single block; `tokens[0]` is the opening token and
  * `tokens[1]` is expected to be the inline token holding the children.
  * @returns The block `type` (`heading` when the opening token is `heading_open`,
  * otherwise `paragraph`), its `level` (length of the opening token's markup) and
  * its parsed children.
  */
 export function parseParagraph(tokens: Token[]): ReportMarkdownParagraph {
  const type = tokens[0].type === HEADING_OPEN ? 'heading' : 'paragraph'
  const level = tokens[0].markup.length
  // `children` is nullable and the second token may be absent; treat both as "no inline content".
  const inlineTokens = tokens[1]?.children ?? []
  const children: ReportMarkdownParagraphChild[] = []
  let start = 0
  inlineTokens.forEach((token, index) => {
    // A `text` or `code_inline` token ends the current run of inline tokens.
    if (token.type === 'code_inline' || token.type === 'text') {
      children.push(parseParagraphChild(inlineTokens.slice(start, index + 1)))
      start = index + 1
    }
  })
  return {
    type,
    level,
    children,
  }
 }
 /**
  * Split a section's tokens into paragraphs and parse each of them.
  *
  * A new paragraph starts at every `heading_open` or `paragraph_open` token after the
  * first position; the remaining tail is always parsed as the last paragraph.
  *
  * @param tokens - Tokens belonging to one section.
  * @returns A `section` node holding the parsed paragraphs.
  */
 export function parseSection(tokens: Token[]): ReportMarkdownSection {
  const opensBlock = (token: Token): boolean =>
    token.type === HEADING_OPEN || token.type === PARAGRAPH_OPEN
  const paragraphs: ReportMarkdownParagraph[] = []
  let from = 0
  let cursor = 0
  while (cursor < tokens.length) {
    if (opensBlock(tokens[cursor]) && cursor > from) {
      paragraphs.push(parseParagraph(tokens.slice(from, cursor)))
      from = cursor
    }
    cursor++
  }
  // Whatever is left after the last boundary forms the final paragraph.
  paragraphs.push(parseParagraph(tokens.slice(from)))
  return { type: 'section', children: paragraphs }
 }
 /**
  * Split the document tokens into sections and parse each of them.
  *
  * A section boundary is a `heading_open` token whose markup is one character long
  * (a level-1 heading) at any position after the start of the current section.
  *
  * @param tokens - All content tokens of the document.
  * @returns The parsed sections, in document order.
  */
 export function parseContent(tokens: Token[]): ReportMarkdownSection[] {
  // Start indices of every section; the first section always starts at 0.
  const cuts: number[] = [0]
  tokens.forEach((token, index) => {
    const isHeadingOne = token.type === HEADING_OPEN && token.markup.length === 1
    if (isHeadingOne && index > cuts[cuts.length - 1])
      cuts.push(index)
  })
  // The last section has no following cut, so `slice` runs to the end.
  return cuts.map((from, n) => parseSection(tokens.slice(from, cuts[n + 1])))
 }
 /**
  * Parse a markdown document into the report structure.
  *
  * @param markdown - Raw markdown source, optionally starting with frontmatter.
  * @returns The raw source, the frontmatter data and the parsed content sections.
  */
 export function parse(markdown: string): ReportMarkdown {
  const extracted = matter(markdown)
  const tokens = md.parse(extracted.content, {})
  return {
    raw: markdown,
    frontmatter: extracted.data,
    content: parseContent(tokens),
  }
 }
 // Sample markdown: a level-1 heading, one paragraph with many inline styles,
 // a fenced code block, a `$$` block and a footnote reference section.
 // NOTE(review): this sits at module top level, so it runs whenever the module is
 // imported — presumably leftover debugging; confirm before publishing.
 const src = `# 111
 this is a paragraph with **\`strong\`**, *italic*, \`inline code\`, [hyperlink](https://syy11.cn), ~~delete~~, ==highlight==, $1 + 1 = 2$, a<sup>sup</sup><sub>sub</sub>, ![image](https://image.cn), footnotes[^foot][^note]
 \`\`\`javascript
 const a = 0
 \`\`\`
 $$
 1+2=3
 $$
 # Refs
 [^foot]: ref1
 [^note]: ref2`
 // Log the children of the second paragraph entry in the first parsed section.
 // eslint-disable-next-line no-console
 console.log(parse(src).content[0].children[1].children)
--- a/packages/parser/src/fs.ts
+++ b/packages/parser/src/fs.ts
@ -1,8 +0,0 @@
 import { promises as fs } from 'fs'
 import type { ReportMarkdown } from '@md-report/types'
 import { parse } from './core'
 /**
  * Load and parse a markdown report.
  *
  * @param filepath - File to read as UTF-8 when `content` is not supplied.
  * @param content - Markdown source to parse instead of reading `filepath`.
  * @returns The parsed report.
  */
 export async function load(filepath: string, content?: string): Promise<ReportMarkdown> {
  // Only touch the file system when no source was handed in.
  if (content !== undefined && content !== null)
    return parse(content)
  const fromDisk = await fs.readFile(filepath, 'utf8')
  return parse(fromDisk)
 }
--- a/packages/parser/src/index.ts
+++ b/packages/parser/src/index.ts
@ -1,2 +1,49 @@
-export * from './core'
+import type Token from 'markdown-it/lib/token'
-export * from './fs'
+import MarkdownIt from 'markdown-it'
 import type { ISectionOptions, IStylesOptions, Paragraph, Table, TableOfContents } from 'docx'
 import { Document } from 'docx'
 import { sliceParagraph, sliceSection } from './utils'
 import { paragraphParser } from './paragraph'
 const md = new MarkdownIt()
 /**
  * Parse markdown into a docx `Document`.
  *
  * @param props - `markdown` is the source text; `config.meta` is passed to
  * markdown-it as the parse environment and `config.styles` to the document.
  * @returns The document built from the markdown tokens.
  */
 export function parse(props: { markdown: string; config: { meta: Record<string, any>; styles: IStylesOptions } }): Document {
  const { markdown, config: { meta, styles } } = props
  // Tokenize first, then hand the flat token list to the document builder.
  const tokens: Token[] = md.parse(markdown, meta)
  return parseDocument(tokens, styles)
 }
 /**
  * Build a docx `Document` from markdown-it tokens.
  *
  * @param tokens - Flat token list of the whole document.
  * @param styles - Styles passed through to the `Document`.
  * @returns A document with one section per slice produced by `sliceSection`.
  */
 export function parseDocument(tokens: Token[], styles: IStylesOptions): Document {
  const sections: ISectionOptions[] = []
  let pos = 0
  // Split and parse sections.
  while (pos < tokens.length) {
    const { tokens: section, offset } = sliceSection(tokens.slice(pos))
    // A slicer that consumes nothing would never let the loop finish.
    if (offset <= 0)
      break
    sections.push(parseSection(section))
    // `offset` is relative to the slice that was passed in, so advance by it.
    pos += offset
  }
  return new Document({
    styles,
    sections,
  })
 }
 /**
  * Parse the tokens of one section into docx section options.
  *
  * Each paragraph slice is dispatched through `paragraphParser` by the tag of its own
  * first token. Slices whose tag has no registered parser are skipped.
  *
  * @param tokens - Tokens belonging to one section.
  * @returns Section options whose `children` are the parsed paragraphs and tables.
  */
 export function parseSection(tokens: Token[]): ISectionOptions {
  const children: (Paragraph | Table | TableOfContents)[] = []
  let pos = 0
  // Split and parse paragraphs.
  while (pos < tokens.length) {
    const { tokens: paragraph, offset } = sliceParagraph(tokens.slice(pos))
    // Stop if the slicer made no progress instead of looping forever.
    if (offset <= 0 || paragraph.length === 0)
      break
    // Dispatch on the current paragraph, not on the first token of the section.
    const parser = paragraphParser[paragraph[0].tag]
    if (parser)
      children.push(parser(paragraph))
    // `offset` is relative to the slice that was passed in, so advance by it.
    pos += offset
  }
  return {
    children,
  }
 }
--- a/packages/parser/src/inline.ts
+++ b/packages/parser/src/inline.ts
@ -0,0 +1,94 @@
 import { readFileSync } from 'fs'
 import type { IImageOptions, IRunOptions, ParagraphChild } from 'docx'
 import { ImageRun, Paragraph, TextRun } from 'docx'
 import type Token from 'markdown-it/lib/token'
 import { sliceInlineText } from './utils'
 /**
  * Turn an inline token into a docx `Paragraph`.
  *
  * The children of `tokens[0]` are cut into runs with `sliceInlineText`; a run whose
  * first token has the `img` tag becomes an image, every other run becomes text.
  *
  * @param props - `tokens[0]` is the inline token to render; `style` is the paragraph
  * style id and defaults to `normal`.
  * @returns The paragraph; it has no children when there is no inline token or the
  * token has no children.
  */
 export function parseInline(props: { tokens: Token[]; style?: string }): Paragraph {
  const { tokens, style = 'normal' } = props
  // `children` is nullable and the token list may be empty.
  const inlineChildren = tokens[0]?.children ?? []
  const children: ParagraphChild[] = []
  let pos = 0
  // Parse inline children.
  while (pos < inlineChildren.length) {
    const { tokens: run, offset } = sliceInlineText(inlineChildren.slice(pos))
    // Stop if the slicer made no progress instead of looping forever.
    if (offset <= 0 || run.length === 0)
      break
    children.push(run[0].tag === 'img' ? parseImage(run) : parseText(run))
    // `offset` is relative to the slice that was passed in, so advance by it.
    pos += offset
  }
  return new Paragraph({
    style,
    children,
  })
 }
 /**
  * Fold a run of inline tokens into a single docx `TextRun`.
  *
  * Closing tokens (negative `nesting`) are ignored. Formatting tags switch on the
  * matching run option, `code` sets an empty font object plus the token content as
  * text, and every other tag sets the token content as text.
  *
  * @param tokens - Inline tokens forming one run.
  * @returns The text run carrying the accumulated options.
  */
 export function parseText(tokens: Token[]): TextRun {
  // Run options contributed by each formatting tag.
  const formatting = new Map<string, IRunOptions>([
    ['strong', { bold: true }],
    ['em', { italics: true }],
    ['sub', { subScript: true }],
    ['sup', { superScript: true }],
    ['s', { strike: true }],
    // TODO: Replace highlight color with env data.
    ['mark', { highlight: 'yellow' }],
  ])
  let options: IRunOptions = {}
  for (const token of tokens) {
    // Only deal with opening and text/code tokens.
    if (token.nesting < 0)
      continue
    const extra = formatting.get(token.tag)
    if (extra !== undefined) {
      options = { ...options, ...extra }
    }
    else if (token.tag === 'code') {
      // TODO: Replace code font with env data.
      options = { ...options, font: {}, text: token.content }
    }
    else {
      // Normal text.
      options = { ...options, text: token.content }
    }
  }
  return new TextRun(options)
 }
 /**
  * Turn an image token into a docx `ImageRun`.
  *
  * @param tokens - Run whose first token is the image token.
  * @returns The image read from the token's `src` attribute, or a highlighted
  * placeholder `TextRun` when the attribute is missing or the file cannot be read.
  */
 export function parseImage(tokens: Token[]): ImageRun | TextRun {
  const token = tokens[0]
  // Call `attrGet` on the token itself: a destructured method loses its `this`.
  const src = token.attrGet('src')
  const notFound = (): TextRun => new TextRun({
    text: `[MD Report]: Image ${token.content} is not found.`,
    bold: true,
    color: 'red',
    highlight: 'yellow',
  })
  if (!src)
    return notFound()
  let data: string
  try {
    data = readFileSync(src).toString('base64')
  }
  catch {
    // Unreadable path (missing file, remote URL, ...): show the placeholder instead of crashing.
    return notFound()
  }
  const options: IImageOptions = {
    data,
    // TODO: Replace width and height with config in image url.
    transformation: {
      width: 100,
      height: 100,
    },
  }
  return new ImageRun(options)
 }
--- a/packages/parser/src/paragraph.ts
+++ b/packages/parser/src/paragraph.ts
@ -0,0 +1,72 @@
 import type Token from 'markdown-it/lib/token'
 import { Paragraph, Table, TableCell, TableRow } from 'docx'
 import { sliceTableRow } from './utils'
 import { parseInline } from './inline'
 /**
  * Render a fenced code block as a single paragraph in the `fence` style.
  *
  * @param tokens - Slice whose first token carries the code in `content`.
  * @returns A paragraph whose text is that content.
  */
 export function parseFence(tokens: Token[]): Paragraph {
  const [fence] = tokens
  return new Paragraph({ style: 'fence', text: fence.content })
 }
 /**
  * Build a docx `Table` from the tokens of one markdown table.
  *
  * @param tokens - All tokens of the table.
  * @returns A table in the `table` style with one row per slice that contains cells.
  */
 export function parseTable(tokens: Token[]): Table {
  const rows: TableRow[] = []
  let pos = 0
  while (pos < tokens.length) {
    const { tokens: tableRow, offset } = sliceTableRow(tokens.slice(pos))
    // Slices without any cell content (wrapper tokens only) are not rows.
    if (tableRow.some(token => token.type === 'inline'))
      rows.push(parseTableRow(tableRow))
    // A slicer that consumes nothing would never let the loop finish.
    if (offset <= 0)
      break
    // `offset` is relative to the slice that was passed in, so advance by it.
    pos += offset
  }
  return new Table({
    style: 'table',
    rows,
  })
 }
 /**
  * Build a docx `TableRow` from the tokens of one table row.
  *
  * @param tokens - Tokens of the row; only `inline` tokens produce cells.
  * @returns A row with one cell per inline token, each holding a `table`-styled paragraph.
  */
 export function parseTableRow(tokens: Token[]): TableRow {
  const children: TableCell[] = []
  for (const token of tokens) {
    if (token.type !== 'inline')
      continue
    const paragraph = parseInline({ tokens: [token], style: 'table' })
    children.push(new TableCell({ children: [paragraph] }))
  }
  return new TableRow({ children })
 }
 /**
  * Render a plain paragraph in the `normal` style.
  *
  * @param tokens - Tokens of the paragraph; only `inline` tokens are forwarded.
  * @returns The paragraph produced by `parseInline`.
  */
 export function parseParagraph(tokens: Token[]): Paragraph {
  const inlineTokens: Token[] = []
  for (const token of tokens) {
    if (token.type === 'inline')
      inlineTokens.push(token)
  }
  return parseInline({ tokens: inlineTokens, style: 'normal' })
 }
 /**
  * Render a heading in the `heading<level>` style.
  *
  * The level is read from the opening token's tag (`h1`..`h6`). The markup length is
  * only a fallback, because setext headings carry a one-character markup at every level.
  *
  * @param tokens - Tokens of the heading; `tokens[0]` is the opening token.
  * @returns The paragraph produced by `parseInline`.
  */
 export function parseHeading(tokens: Token[]): Paragraph {
  // Inline token.
  const inline = tokens.filter(token => token.type === 'inline')
  // Heading level.
  const [opening] = tokens
  const level = Number(opening.tag.slice(1)) || opening.markup.length
  return parseInline({
    tokens: inline,
    style: `heading${level}`,
  })
 }
 /**
  * Block parsers keyed by the tag of a block's first token.
  * All six heading tags share `parseHeading`.
  */
 export const paragraphParser: Record<string, (tokens: Token[]) => (Paragraph|Table)> = {
  code: parseFence,
  table: parseTable,
  p: parseParagraph,
  ...Object.fromEntries(
    ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'].map(tag => [tag, parseHeading]),
  ),
 }
--- a/packages/parser/src/utils.ts
+++ b/packages/parser/src/utils.ts
@ -1,55 +1,53 @@
-import type { IRunOptions } from 'docx'
+import type Token from 'markdown-it/lib/token'
 import type { MarkdownItTokenType } from '@md-report/types'
 import Token = require('markdown-it/lib/token')
 import { KAI_TI_FIRA_CODE_FONTS } from './constants'
-export function getParagraphChildType(token: Token): 'image' | 'text' {
 // Result returned by the slice helpers: `tokens` is the run that was cut off and
 // `offset` is the number the callers use to move their cursor.
+export interface SliceResult {
-  switch (token.type) {
+  tokens: Token[]
-    case 'image':
+  offset: number
-      return 'image'
+}
-    default:
+
-      return 'text'
+export function sliceSection(tokens: Token[]): SliceResult {
  let offset = 0
  if (tokens[0].tag === 'h1') {
    while (tokens[offset].nesting >= 0 || tokens[offset].tag !== 'h1')
      offset++
  }
  return {
    tokens: tokens.slice(0, offset + 1),
    offset: offset + 1,
  }
 }
-export function getParagraphChildConfig(tokens: Token[]): IRunOptions {
+export function sliceParagraph(tokens: Token[]): SliceResult {
-  let config: IRunOptions = {}
+  let offset = 0
-
+  // Code block.
-  for (let i = 0; i < tokens.length; i++) {
+  if (tokens[0].type !== 'fence') {
-    const token = tokens[i]
+    // Normal paragraphs.
-    switch (token.type as MarkdownItTokenType) {
+    while (tokens[offset].level > 0 || tokens[offset].nesting >= 0)
-      case 'em_open': {
+      offset++
        config = { ...config, italics: true }
        break
      }
      case 'strong_open': {
        config = { ...config, bold: true }
        break
      }
      case 'mark_open': {
        config = { ...config, shading: { fill: '#bbbbbb' }, style: 'mark' }
        break
      }
      case 'html_inline': {
        if (token.content === '<sup>')
          config = { ...config, superScript: true }
        if (token.content === '<sub>')
          config = { ...config, subScript: true }
        break
      }
      case 's_open': {
        config = { ...config, strike: true }
        break
      }
      case 'code_inline': {
        config = { ...config, font: KAI_TI_FIRA_CODE_FONTS, style: 'code' }
        break
      }
      case 'text': {
        config = { ...config, text: token.content }
  }
  // Return paragraph tokens.
  return {
    tokens: tokens.slice(0, offset + 1),
    offset: offset + 1,
  }
 }
-  return config
+export function sliceTableRow(tokens: Token[]): SliceResult {
  let offset = 0
  while (tokens[offset].type !== 'tr_open')
    offset++
  return {
    tokens: tokens.slice(0, offset),
    offset,
  }
 }
 /**
  * Cut the next inline run off a list of inline child tokens.
  *
  * Images, inline code and any other token with `nesting` 0 (plain text, for
  * example) are runs on their own; a run that starts with an opening token is cut
  * by `sliceParagraph`.
  *
  * @param tokens - Remaining inline child tokens; must not be empty.
  * @returns The run's tokens and how many tokens were consumed.
  */
 export function sliceInlineText(tokens: Token[]): SliceResult {
  const [head] = tokens
  // Self-contained tokens must not be merged with the styled run that follows them.
  if (head.tag === 'img' || head.tag === 'code' || head.nesting === 0) {
    return {
      tokens: tokens.slice(0, 1),
      offset: 1,
    }
  }
  return sliceParagraph(tokens)
 }