feat(parser): parse markdown-it to schema

This commit is contained in:
syy11cn 2022-04-16 17:16:34 +08:00
parent 37b665166f
commit d68e6d9e2e
3 changed files with 166 additions and 3 deletions

View File

@ -0,0 +1,9 @@
// MarkdownIt token types.
// Type of the token that opens a heading block.
export const HEADING_OPEN = 'heading_open'
// Type of the token that opens a paragraph block.
export const PARAGRAPH_OPEN = 'paragraph_open'
// Style
// Font pair used for inline code runs: `ascii` names the font for ASCII text,
// `eastAsia` names the font for East Asian text.
export const KAI_TI_FIRA_CODE_FONTS = {
ascii: 'Fira Code',
eastAsia: 'KaiTi',
}

View File

@ -1,8 +1,11 @@
import YAML = require('js-yaml')
import { isObject } from '@antfu/utils'
import MarkdownIt = require('markdown-it')
import type {} from '@md-report/parser'
import type { ReportConfig, ReportMarkdown } from '@md-report/types'
import type { ReportConfig, ReportMarkdown, ReportMarkdownParagraph, ReportMarkdownParagraphChild, ReportMarkdownSection } from '@md-report/types'
import Token = require('markdown-it/lib/token')
import type { IRunOptions } from 'docx'
import { HEADING_OPEN, PARAGRAPH_OPEN } from './constants'
import { getParagraphChildConfig, getParagraphChildType } from './utils'
// Shared markdown-it instance used to tokenize report content.
// `html: true` enables HTML tags in the markdown source.
const md = MarkdownIt({ html: true })
@ -18,12 +21,108 @@ export function matter(code: string): { data: ReportConfig; content: string } {
return { data, content }
}
/**
 * Build one paragraph child (a single run) from a slice of inline tokens.
 *
 * The slice may begin with closing tokens left over from the previous run;
 * they are skipped before the child type is read from the first remaining
 * token. The run options are then computed from the remaining tokens.
 *
 * @param tokens - Inline tokens that make up one run.
 * @returns The child type and its docx run options. When the slice is empty
 *   or holds nothing but closing tokens, an empty text child is returned.
 */
export function parseParagraphChild(tokens: Token[]): ReportMarkdownParagraphChild {
  // A closer is a markdown-it `*_close` token or an inline HTML closing tag.
  // The HTML check is limited to `html_inline` tokens so that ordinary text
  // which merely contains something like "</ ... >" is never discarded.
  const isClosing = (token: Token): boolean =>
    token.type.endsWith('_close')
    || (token.type === 'html_inline' && /^<\/[^>]*>$/.test(token.content))

  // Get rid of closing tags, never reading past the end of the slice.
  let i = 0
  while (i < tokens.length && isClosing(tokens[i]))
    i++

  const first = tokens[i]
  if (first === undefined)
    return { type: 'text', config: {} }

  const type = getParagraphChildType(first)
  const config: IRunOptions = getParagraphChildConfig(tokens.slice(i))
  return {
    type,
    config,
  }
}
/**
 * Convert the tokens of one block (heading or paragraph) into a paragraph node.
 *
 * `tokens[0]` is expected to be the opening token and `tokens[1]` the inline
 * token whose `children` hold the inline content. The inline children are cut
 * into runs: every `text`, `code_inline` or `image` token ends a run, and the
 * formatting tokens that precede it belong to that run.
 *
 * @param tokens - Tokens of a single block, starting at its opening token.
 * @returns The node type, its level and its parsed children. A slice without
 *   inline content yields an empty `children` list.
 */
export function parseParagraph(tokens: Token[]): ReportMarkdownParagraph {
  const [open, inline] = tokens
  const isHeading = open?.type === HEADING_OPEN
  const type = isHeading ? 'heading' : 'paragraph'

  // Read the heading level from the tag (`h1`..`h6`): setext headings use a
  // single-character markup (`=` or `-`) for both level 1 and level 2, so the
  // markup length alone cannot tell them apart.
  const headingTag = isHeading ? /^h([1-6])$/.exec(open?.tag ?? '') : null
  const level = headingTag ? Number(headingTag[1]) : (open?.markup.length ?? 0)

  // `children` is null on tokens that carry no inline content.
  const inlineTokens = inline?.children ?? []
  const children: ReportMarkdownParagraphChild[] = []
  let start = 0
  inlineTokens.forEach((token, i) => {
    // Images end a run as well, so they are neither dropped at the end of a
    // paragraph nor merged into the text run that follows them.
    if (token.type === 'text' || token.type === 'code_inline' || token.type === 'image') {
      children.push(parseParagraphChild(inlineTokens.slice(start, i + 1)))
      start = i + 1
    }
  })

  return {
    type,
    level,
    children,
  }
}
/**
 * Split the tokens of one section into blocks and parse each block.
 *
 * A new block starts at every heading or paragraph opening token. Tokens that
 * follow a block without starting a new one stay in that block's slice.
 *
 * @param tokens - Tokens belonging to a single section.
 * @returns A section node whose children are the parsed blocks. Empty input
 *   produces a section without children.
 */
export function parseSection(tokens: Token[]): ReportMarkdownSection {
  const startsBlock = (token: Token | undefined): boolean =>
    token !== undefined && (token.type === HEADING_OPEN || token.type === PARAGRAPH_OPEN)

  const children: ReportMarkdownParagraph[] = []
  // Only slices that begin with a heading/paragraph opener are parsed. The
  // leading slice can be empty or begin with another kind of token (fence,
  // list, table, ...), which the paragraph parser cannot handle.
  const flush = (slice: Token[]): void => {
    if (startsBlock(slice[0]))
      children.push(parseParagraph(slice))
  }

  let start = 0
  tokens.forEach((token, i) => {
    if (i > start && startsBlock(token)) {
      flush(tokens.slice(start, i))
      start = i
    }
  })
  flush(tokens.slice(start))

  return {
    type: 'section',
    children,
  }
}
/**
 * Split a document's token stream into sections and parse each one.
 *
 * A new section starts at every level-1 heading; tokens that come before the
 * first level-1 heading form a section of their own.
 *
 * @param tokens - Token stream of the whole document body.
 * @returns The parsed sections in document order, or an empty list when there
 *   are no tokens.
 */
export function parseContent(tokens: Token[]): ReportMarkdownSection[] {
  const sections: ReportMarkdownSection[] = []
  if (tokens.length === 0)
    return sections

  // Level 1 is identified by the `h1` tag. The markup length is not reliable:
  // a setext level-2 heading has the one-character markup `-`.
  const isHeading1 = (token: Token): boolean =>
    token.type === HEADING_OPEN && token.tag === 'h1'

  let start = 0
  tokens.forEach((token, i) => {
    if (i > start && isHeading1(token)) {
      sections.push(parseSection(tokens.slice(start, i)))
      start = i
    }
  })
  sections.push(parseSection(tokens.slice(start)))
  return sections
}
/**
 * Parse a markdown report into its structured representation.
 *
 * The front matter is separated from the body with `matter`, the body is
 * tokenized with markdown-it, and the tokens are grouped into sections.
 *
 * @param markdown - Full markdown source, including any front matter.
 * @returns The raw source, the front matter data and the parsed sections.
 */
export function parse(markdown: string): ReportMarkdown {
  const { data: frontmatter, content: rawContent } = matter(markdown)
  // Tokenize once; `content` is the parsed section list, not the raw tokens.
  const contentTokens = md.parse(rawContent, {})
  const content = parseContent(contentTokens)
  return {
    raw: markdown,
    frontmatter,
    content,
  }
}
// Ad-hoc sample document used to eyeball the parser output. It contains a
// heading, a paragraph with many inline constructs, a fenced code block, a
// `$$` block and footnote definitions.
// NOTE(review): this fixture and the log below run at module load, so every
// import of this file parses the sample and prints to the console - confirm
// whether this should be moved into a test or script.
const src = `# 111
this is a paragraph with **\`strong\`**, *italic*, \`inline code\`, [hyperlink](https://syy11.cn), ~~delete~~, ==highlight==, $1 + 1 = 2$, a<sup>sup</sup><sub>sub</sub>, ![image](https://image.cn), footnotes[^foot][^note]
\`\`\`javascript
const a = 0
\`\`\`
$$
1+2=3
$$
# Refs
[^foot]: ref1
[^note]: ref2`
// Print the inline children of the second block in the first section.
// eslint-disable-next-line no-console
console.log(parse(src).content[0].children[1].children)

View File

@ -0,0 +1,55 @@
import type { IRunOptions } from 'docx'
import type { MarkdownItTokenType } from '@md-report/types'
import Token = require('markdown-it/lib/token')
import { KAI_TI_FIRA_CODE_FONTS } from './constants'
/**
 * Classify a token as an image child or a text child.
 *
 * @param token - The token whose `type` decides the child kind.
 * @returns `'image'` for `image` tokens, `'text'` for every other token type.
 */
export function getParagraphChildType(token: Token): 'image' | 'text' {
  return token.type === 'image' ? 'image' : 'text'
}
/**
 * Fold the tokens of one run into docx run options.
 *
 * Opening tokens switch on the matching formatting flag, `<sup>` / `<sub>`
 * inline HTML selects superscript / subscript, and the content-bearing token
 * (`text` or `code_inline`) supplies the run text. Token types that are not
 * listed leave the options untouched.
 *
 * @param tokens - Tokens that make up a single run.
 * @returns The accumulated run options; an empty object for an empty slice.
 */
export function getParagraphChildConfig(tokens: Token[]): IRunOptions {
  let config: IRunOptions = {}
  for (const token of tokens) {
    switch (token.type as MarkdownItTokenType) {
      case 'em_open':
        config = { ...config, italics: true }
        break
      case 'strong_open':
        config = { ...config, bold: true }
        break
      case 'mark_open':
        config = { ...config, shading: { fill: '#bbbbbb' }, style: 'mark' }
        break
      case 'html_inline':
        if (token.content === '<sup>')
          config = { ...config, superScript: true }
        if (token.content === '<sub>')
          config = { ...config, subScript: true }
        break
      case 's_open':
        config = { ...config, strike: true }
        break
      case 'code_inline':
        // An inline code token carries its own text in `content`; without
        // copying it here the code run would be emitted with no text.
        config = { ...config, font: KAI_TI_FIRA_CODE_FONTS, style: 'code', text: token.content }
        break
      case 'text':
        config = { ...config, text: token.content }
        break
    }
  }
  return config
}