refactor(parser): paragraph, table and fence

2022-05-29 10:45:57 +08:00 · 2022-05-29 10:45:57 +08:00 · 7df26af941
parent 0e3561add3
commit 7df26af941
9 changed files with 262 additions and 195 deletions
--- a/packages/parser/README.md
+++ b/packages/parser/README.md
@ -1,6 +1,6 @@
 # @md-report/parser

-Transfer plain markdown text to md-report data structure.
+Transform plain markdown text into markdown tokens.

 ## License

--- a/packages/parser/package.json
+++ b/packages/parser/package.json
@ -20,6 +20,7 @@
    "prepublishOnly": "nr build"
  },
  "dependencies": {
+    "docx": "^7.3.0",
    "js-yaml": "^4.1.0"
  }
 }
--- a/packages/parser/src/constants.ts
+++ b/packages/parser/src/constants.ts
@ -1,9 +0,0 @@
-// MarkdownIt token types.
-export const HEADING_OPEN = 'heading_open'
-export const PARAGRAPH_OPEN = 'paragraph_open'
-
-// Style
-export const KAI_TI_FIRA_CODE_FONTS = {
-  ascii: 'Fira Code',
-  eastAsia: 'KaiTi',
-}
--- a/packages/parser/src/core.ts
+++ b/packages/parser/src/core.ts
@ -1,128 +0,0 @@
-import YAML = require('js-yaml')
-import { isObject } from '@antfu/utils'
-import MarkdownIt = require('markdown-it')
-import type { ReportConfig, ReportMarkdown, ReportMarkdownParagraph, ReportMarkdownParagraphChild, ReportMarkdownSection } from '@md-report/types'
-import Token = require('markdown-it/lib/token')
-import type { IRunOptions } from 'docx'
-import { HEADING_OPEN, PARAGRAPH_OPEN } from './constants'
-import { getParagraphChildConfig, getParagraphChildType } from './utils'
-
-const md = MarkdownIt({ html: true })
-
-export function matter(code: string): { data: ReportConfig; content: string } {
-  let data: any = {}
-  const content = code.replace(/^---.*\r?\n([\s\S]*?)---/,
-    (_, d) => {
-      data = YAML.load(d)
-      if (!isObject(data))
-        data = {}
-      return ''
-    })
-  return { data, content }
-}
-
-export function parseParagraphChild(tokens: Token[]): ReportMarkdownParagraphChild {
-  // Get rid of closing tags.
-  let i = 0
-  while (tokens[i].type.includes('close') || tokens[i].content.match(/\<\/[^]*?\>/))
-    i++
-
-  const type = getParagraphChildType(tokens[i])
-  const config: IRunOptions = getParagraphChildConfig(tokens.slice(i))
-
-  return {
-    type,
-    config,
-  }
-}
-
-export function parseParagraph(tokens: Token[]): ReportMarkdownParagraph {
-  const type = tokens[0].type === HEADING_OPEN ? 'heading' : 'paragraph'
-  const level = tokens[0].markup.length
-  const _content = tokens[1].children
-  const children: ReportMarkdownParagraphChild[] = []
-
-  let start = 0
-  for (let i = 0; i < _content.length; i++) {
-    const _token = _content[i]
-    if (_token.type === 'code_inline' || _token.type === 'text') {
-      children.push(parseParagraphChild(_content.slice(start, i + 1)))
-      start = i + 1
-    }
-  }
-
-  return {
-    type,
-    level,
-    children,
-  }
-}
-
-export function parseSection(tokens: Token[]): ReportMarkdownSection {
-  const children: ReportMarkdownParagraph[] = []
-
-  let start = 0
-  for (let i = 0; i < tokens.length; i++) {
-    const token = tokens[i]
-    if ((token.type === HEADING_OPEN || token.type === PARAGRAPH_OPEN) && i > start) {
-      children.push(parseParagraph(tokens.slice(start, i)))
-      start = i
-    }
-  }
-  children.push(parseParagraph(tokens.slice(start)))
-
-  return {
-    type: 'section',
-    children,
-  }
-}
-
-export function parseContent(tokens: Token[]): ReportMarkdownSection[] {
-  const sections: ReportMarkdownSection[] = []
-
-  let start = 0
-  for (let i = 0; i < tokens.length; i++) {
-    const token = tokens[i]
-    // If heading 1.
-    if (token.type === HEADING_OPEN && token.markup.length === 1 && i > start) {
-      sections.push(parseSection(tokens.slice(start, i)))
-      start = i
-    }
-  }
-  sections.push(parseSection(tokens.slice(start)))
-
-  return sections
-}
-
-export function parse(markdown: string): ReportMarkdown {
-  const { data: frontmatter, content: rawContent } = matter(markdown)
-  const contentTokens = md.parse(rawContent, {})
-
-  const content = parseContent(contentTokens)
-
-  return {
-    raw: markdown,
-    frontmatter,
-    content,
-  }
-}
-
-const src = `# 111
-
-this is a paragraph with **\`strong\`**, *italic*, \`inline code\`, [hyperlink](https://syy11.cn), ~~delete~~, ==highlight==, $1 + 1 = 2$, a<sup>sup</sup><sub>sub</sub>, ![image](https://image.cn), footnotes[^foot][^note]
-
-\`\`\`javascript
-const a = 0
-\`\`\`
-
-$$
-1+2=3
-$$
-
-# Refs
-
-[^foot]: ref1
-[^note]: ref2`
-
-// eslint-disable-next-line no-console
-console.log(parse(src).content[0].children[1].children)
--- a/packages/parser/src/fs.ts
+++ b/packages/parser/src/fs.ts
@ -1,8 +0,0 @@
-import { promises as fs } from 'fs'
-import type { ReportMarkdown } from '@md-report/types'
-import { parse } from './core'
-
-export async function load(filepath: string, content?: string): Promise<ReportMarkdown> {
-  const markdown = content ?? await fs.readFile(filepath, 'utf8')
-  return parse(markdown)
-}
--- a/packages/parser/src/index.ts
+++ b/packages/parser/src/index.ts
@ -1,2 +1,49 @@
-export * from './core'
-export * from './fs'
+import type Token from 'markdown-it/lib/token'
+import MarkdownIt from 'markdown-it'
+import type { ISectionOptions, IStylesOptions, Paragraph, Table, TableOfContents } from 'docx'
+import { Document } from 'docx'
+import { sliceParagraph, sliceSection } from './utils'
+import { paragraphParser } from './paragraph'
+
+// Shared markdown-it instance, constructed with default options; `parse` uses it to tokenize input.
+const md = new MarkdownIt()
+
+/**
+ * Tokenize a markdown string and convert the tokens into a docx `Document`.
+ *
+ * @param props.markdown Markdown source text.
+ * @param props.config.meta Handed to `md.parse` as its second argument.
+ * @param props.config.styles Styles forwarded to `parseDocument`.
+ * @returns The document built by `parseDocument`.
+ */
+export function parse(props: { markdown: string; config: { meta: Record<string, any>; styles: IStylesOptions } }): Document {
+  const { markdown, config: { meta, styles } } = props
+  // Tokenize the raw text; no frontmatter handling happens in this function.
+  const tokens: Token[] = md.parse(markdown, meta)
+  return parseDocument(tokens, styles)
+}
+
+/**
+ * Split a token stream into sections and build a docx `Document` from them.
+ *
+ * @param tokens Block-level markdown-it tokens of the whole document.
+ * @param styles Styles passed through to the `Document` constructor.
+ * @returns A `Document` containing one section per slice returned by `sliceSection`.
+ */
+export function parseDocument(tokens: Token[], styles: IStylesOptions): Document {
+  const sections: ISectionOptions[] = []
+  let pos = 0
+  // Split and parse sections.
+  while (pos < tokens.length) {
+    const { tokens: section, offset } = sliceSection(tokens.slice(pos))
+    sections.push(parseSection(section))
+    // `offset` is relative to the slice handed to sliceSection, so advance by it
+    // (assigning it would rewind `pos` on every section after the first).
+    // Always move forward by at least one token so the loop terminates.
+    pos += Math.max(offset, 1)
+  }
+  return new Document({
+    styles,
+    sections,
+  })
+}
+
+/**
+ * Split the tokens of one section into block-level chunks and parse each chunk
+ * with the parser registered for its leading tag in `paragraphParser`.
+ *
+ * @param tokens Tokens belonging to a single section.
+ * @returns Section options holding the parsed children.
+ * @throws Error when a chunk starts with a tag that has no registered parser.
+ */
+export function parseSection(tokens: Token[]): ISectionOptions {
+  const children: (Paragraph | Table | TableOfContents)[] = []
+  let pos = 0
+  // Split and parse paragraphs.
+  while (pos < tokens.length) {
+    const { tokens: paragraph, offset } = sliceParagraph(tokens.slice(pos))
+    // Dispatch on the chunk being parsed, not on the first token of the section.
+    const { tag } = paragraph[0]
+    const parser = paragraphParser[tag]
+    if (!parser)
+      throw new Error(`[MD Report]: Unsupported markdown block <${tag}>.`)
+    children.push(parser(paragraph))
+    // `offset` is relative to the slice, so accumulate it; never stand still.
+    pos += Math.max(offset, 1)
+  }
+  return {
+    children,
+  }
+}
--- a/packages/parser/src/inline.ts
+++ b/packages/parser/src/inline.ts
@ -0,0 +1,94 @@
+import { readFileSync } from 'fs'
+import type { IImageOptions, IRunOptions, ParagraphChild } from 'docx'
+import { ImageRun, Paragraph, TextRun } from 'docx'
+import type Token from 'markdown-it/lib/token'
+import { sliceInlineText } from './utils'
+
+/**
+ * Build a docx `Paragraph` from the children of an inline token.
+ *
+ * Only `tokens[0]` is read; its `children` are split into runs with
+ * `sliceInlineText` and each run becomes an image or a text run.
+ *
+ * @param props.tokens Inline tokens; the first one supplies the children.
+ * @param props.style Paragraph style id, `'normal'` when omitted.
+ * @returns The paragraph; it has no children when there is no inline token.
+ */
+export function parseInline(props: { tokens: Token[]; style?: string }): Paragraph {
+  const { tokens, style = 'normal' } = props
+  // Tolerate a missing inline token or a token without children.
+  const inlineChildren: Token[] = tokens[0]?.children ?? []
+  const children: ParagraphChild[] = []
+  let pos = 0
+  // Parse inline children.
+  while (pos < inlineChildren.length) {
+    // Slice the inline children themselves, not the outer token list.
+    const { tokens: run, offset } = sliceInlineText(inlineChildren.slice(pos))
+    // Decide per run: a run is an image when its last token is an `img` token.
+    const last = run[run.length - 1]
+    if (last?.tag === 'img')
+      children.push(parseImage([last]))
+    else
+      children.push(parseText(run))
+    // `offset` is relative to the slice; always advance by at least one token.
+    pos += Math.max(offset, 1)
+  }
+  return new Paragraph({
+    style,
+    children,
+  })
+}
+
+// Run options contributed by each formatting tag.
+// TODO: Replace highlight color with env data.
+const runOptionsByTag = new Map<string, IRunOptions>([
+  ['strong', { bold: true }],
+  ['em', { italics: true }],
+  ['sub', { subScript: true }],
+  ['sup', { superScript: true }],
+  ['s', { strike: true }],
+  ['mark', { highlight: 'yellow' }],
+])
+
+/**
+ * Fold a run of inline tokens into a single docx `TextRun`.
+ *
+ * Closing tokens (negative `nesting`) are ignored. Formatting tags add their
+ * flag, `code` sets an empty font object plus the token content as text, and
+ * every other tag sets the token content as text.
+ *
+ * @param tokens Inline tokens that make up one run.
+ * @returns A `TextRun` built from the accumulated options.
+ */
+export function parseText(tokens: Token[]): TextRun {
+  const options = tokens
+    .filter(token => token.nesting >= 0)
+    .reduce<IRunOptions>((acc, token) => {
+      const formatting = runOptionsByTag.get(token.tag)
+      if (formatting)
+        return { ...acc, ...formatting }
+      // TODO: Replace code font with env data.
+      if (token.tag === 'code')
+        return { ...acc, font: {}, text: token.content }
+      // Normal text.
+      return { ...acc, text: token.content }
+    }, {})
+  return new TextRun(options)
+}
+
+/**
+ * Turn an image token into a docx `ImageRun`.
+ *
+ * The `src` attribute is read as a local file path. When the attribute is
+ * missing or the file cannot be read, a highlighted warning `TextRun` is
+ * returned instead so the problem is visible in the generated document.
+ *
+ * @param tokens Tokens of the run; only `tokens[0]` is used.
+ * @returns The image run, or the warning text run.
+ */
+export function parseImage(tokens: Token[]): ImageRun | TextRun {
+  const token = tokens[0]
+  // Call attrGet on the token itself: a destructured method loses its `this`.
+  const src = token.attrGet('src')
+  const notFound = (): TextRun => new TextRun({
+    text: `[MD Report]: Image ${token.content} is not found.`,
+    bold: true,
+    color: 'red',
+    highlight: 'yellow',
+  })
+  if (!src)
+    return notFound()
+  let data: string
+  try {
+    data = readFileSync(src).toString('base64')
+  }
+  catch {
+    // Any read failure (missing file, remote URL, ...) gets the same placeholder.
+    return notFound()
+  }
+  const options: IImageOptions = {
+    data,
+    // TODO: Replace width and height with config in image url.
+    transformation: {
+      width: 100,
+      height: 100,
+    },
+  }
+  return new ImageRun(options)
+}
--- a/packages/parser/src/paragraph.ts
+++ b/packages/parser/src/paragraph.ts
@ -0,0 +1,72 @@
+import type Token from 'markdown-it/lib/token'
+import { Paragraph, Table, TableCell, TableRow } from 'docx'
+import { sliceTableRow } from './utils'
+import { parseInline } from './inline'
+
+/**
+ * Build a paragraph styled `fence` whose text is the content of the first token.
+ *
+ * @param tokens Tokens of the block; only the first one is read.
+ * @returns The resulting paragraph.
+ */
+export function parseFence(tokens: Token[]): Paragraph {
+  const [fence] = tokens
+  // The content is handed over unchanged.
+  return new Paragraph({ style: 'fence', text: fence.content })
+}
+
+/**
+ * Build a docx `Table` from the tokens of a markdown table.
+ *
+ * The tokens are cut into rows with `sliceTableRow`; slices that contain no
+ * `inline` token (wrapper tokens only) do not produce a row.
+ *
+ * @param tokens Tokens of one table.
+ * @returns The table, styled `table`.
+ */
+export function parseTable(tokens: Token[]): Table {
+  const rows: TableRow[] = []
+  let pos = 0
+  while (pos < tokens.length) {
+    const { tokens: tableRow, offset } = sliceTableRow(tokens.slice(pos))
+    // Skip slices without cell content so no empty row is emitted.
+    if (tableRow.some(token => token.type === 'inline'))
+      rows.push(parseTableRow(tableRow))
+    // `offset` is relative to the slice, so accumulate it; advancing by at
+    // least one token guarantees the loop terminates.
+    pos += Math.max(offset, 1)
+  }
+  return new Table({
+    style: 'table',
+    rows,
+  })
+}
+
+/**
+ * Build a table row with one cell per `inline` token in the given slice.
+ *
+ * @param tokens Tokens of a single table row.
+ * @returns A row whose cells each hold one paragraph styled `table`.
+ */
+export function parseTableRow(tokens: Token[]): TableRow {
+  const children: TableCell[] = []
+  for (const token of tokens) {
+    // Only inline tokens carry cell content.
+    if (token.type !== 'inline')
+      continue
+    const paragraph = parseInline({ tokens: [token], style: 'table' })
+    children.push(new TableCell({ children: [paragraph] }))
+  }
+  return new TableRow({ children })
+}
+
+/**
+ * Parse a plain paragraph: keep its `inline` tokens and hand them to
+ * `parseInline` with the `normal` style.
+ *
+ * @param tokens Tokens of one paragraph block.
+ * @returns The paragraph produced by `parseInline`.
+ */
+export function parseParagraph(tokens: Token[]): Paragraph {
+  const inlineTokens: Token[] = []
+  for (const token of tokens) {
+    if (token.type === 'inline')
+      inlineTokens.push(token)
+  }
+  return parseInline({ tokens: inlineTokens, style: 'normal' })
+}
+
+/**
+ * Parse a heading block into a paragraph styled `heading<level>`.
+ *
+ * @param tokens Tokens of one heading block; `tokens[0]` is expected to carry
+ *   an `h1`..`h6` tag (that is how `paragraphParser` dispatches here).
+ * @returns The paragraph produced by `parseInline`.
+ */
+export function parseHeading(tokens: Token[]): Paragraph {
+  // Inline token.
+  const inline = tokens.filter(token => token.type === 'inline')
+  // Heading level, taken from the tag (`h2` -> 2). The markup length is only
+  // reliable for `#`-style headings; underlined headings have a one-character markup.
+  const level = Number(tokens[0].tag.slice(1))
+  return parseInline({
+    tokens: inline,
+    style: `heading${level}`,
+  })
+}
+
+// Lookup table from a token tag to the function that parses a block starting with that tag.
+export const paragraphParser: Record<string, (tokens: Token[]) => (Paragraph|Table)> = {
+  // Code blocks.
+  code: parseFence,
+  // Tables.
+  table: parseTable,
+  // Plain paragraphs.
+  p: parseParagraph,
+  // Headings of every level share one parser.
+  h1: parseHeading,
+  h2: parseHeading,
+  h3: parseHeading,
+  h4: parseHeading,
+  h5: parseHeading,
+  h6: parseHeading,
+}
--- a/packages/parser/src/utils.ts
+++ b/packages/parser/src/utils.ts
@ -1,55 +1,53 @@
-import type { IRunOptions } from 'docx'
-import type { MarkdownItTokenType } from '@md-report/types'
-import Token = require('markdown-it/lib/token')
-import { KAI_TI_FIRA_CODE_FONTS } from './constants'
+import type Token from 'markdown-it/lib/token'

-export function getParagraphChildType(token: Token): 'image' | 'text' {
-  switch (token.type) {
-    case 'image':
-      return 'image'
-    default:
-      return 'text'
+/** Result shared by the slice helpers in this file. */
+export interface SliceResult {
+  /** The tokens cut from the front of the input. */
+  tokens: Token[]
+  /** Index in the input at which the returned slice ends; callers use it to advance their position. */
+  offset: number
+}
+
+/**
+ * Cut the leading section off a token stream.
+ *
+ * A section runs from the first token up to, but not including, the next
+ * opening `h1` token, or to the end of the input when there is none.
+ *
+ * @param tokens Remaining block-level tokens.
+ * @returns The section tokens and the number of tokens consumed. For empty
+ *   input the slice is empty and `offset` is 1.
+ */
+export function sliceSection(tokens: Token[]): SliceResult {
+  // The first token always belongs to this section, so scanning starts after it.
+  let offset = 1
+  // Bounded scan: stop at the next opening h1 or at the end of the input.
+  while (offset < tokens.length && !(tokens[offset].tag === 'h1' && tokens[offset].nesting === 1))
+    offset++
+  return {
+    tokens: tokens.slice(0, offset),
+    offset,
  }
 }

-export function getParagraphChildConfig(tokens: Token[]): IRunOptions {
-  let config: IRunOptions = {}
+/**
+ * Cut the leading block off a token stream.
+ *
+ * A `fence` token is a block on its own. Any other block extends through the
+ * first closing token (`nesting < 0`) at level 0; when no such token exists
+ * the rest of the input is returned.
+ *
+ * @param tokens Remaining tokens.
+ * @returns The block tokens and the number of tokens consumed. For empty
+ *   input the slice is empty and `offset` is 1.
+ */
+export function sliceParagraph(tokens: Token[]): SliceResult {
+  let offset = 0
+  // Everything except a fence spans several tokens.
+  if (tokens.length > 0 && tokens[0].type !== 'fence') {
+    const last = tokens.length - 1
+    // Bounded scan so a missing closing token cannot run past the end.
+    while (offset < last && (tokens[offset].level > 0 || tokens[offset].nesting >= 0))
+      offset++
+  }
+  // Return paragraph tokens.
+  return {
+    tokens: tokens.slice(0, offset + 1),
+    offset: offset + 1,
+  }
+}

-  for (let i = 0; i < tokens.length; i++) {
-    const token = tokens[i]
-    switch (token.type as MarkdownItTokenType) {
-      case 'em_open': {
-        config = { ...config, italics: true }
-        break
-      }
-      case 'strong_open': {
-        config = { ...config, bold: true }
-        break
-      }
-      case 'mark_open': {
-        config = { ...config, shading: { fill: '#bbbbbb' }, style: 'mark' }
-        break
-      }
-      case 'html_inline': {
-        if (token.content === '<sup>')
-          config = { ...config, superScript: true }
-        if (token.content === '<sub>')
-          config = { ...config, subScript: true }
-        break
-      }
-      case 's_open': {
-        config = { ...config, strike: true }
-        break
-      }
-      case 'code_inline': {
-        config = { ...config, font: KAI_TI_FIRA_CODE_FONTS, style: 'code' }
-        break
-      }
-      case 'text': {
-        config = { ...config, text: token.content }
-      }
+/**
+ * Cut the leading table row off a token stream.
+ *
+ * The slice runs through the first `tr_close` token (inclusive), or to the
+ * end of the input when there is none.
+ *
+ * @param tokens Remaining tokens of a table.
+ * @returns The row tokens and the number of tokens consumed (0 for empty input).
+ */
+export function sliceTableRow(tokens: Token[]): SliceResult {
+  let offset = 0
+  // Bounded scan for the end of the current row.
+  while (offset < tokens.length && tokens[offset].type !== 'tr_close')
+    offset++
+  // Include the `tr_close` token itself when one was found.
+  const end = Math.min(offset + 1, tokens.length)
+  return {
+    tokens: tokens.slice(0, end),
+    offset: end,
+  }
+}
+
+/**
+ * Cut the leading run off a list of inline child tokens.
+ *
+ * A run consists of any opening/closing tokens followed by the first
+ * self-contained token (`nesting === 0`), which ends the run. When no such
+ * token remains, the rest of the input forms the run.
+ *
+ * @param tokens Remaining inline child tokens.
+ * @returns The run tokens and the number of tokens consumed (0 for empty input).
+ */
+export function sliceInlineText(tokens: Token[]): SliceResult {
+  let end = tokens.length
+  for (let i = 0; i < tokens.length; i++) {
+    // The first self-contained token closes the run.
+    if (tokens[i].nesting === 0) {
+      end = i + 1
+      break
     }
   }
-
-  return config
+  return { tokens: tokens.slice(0, end), offset: end }
 }