diff --git a/packages/parser/README.md b/packages/parser/README.md index ee30ef3..0a3de96 100644 --- a/packages/parser/README.md +++ b/packages/parser/README.md @@ -1,6 +1,6 @@ # @md-report/parser -Transfer plain markdown text to md-report data structure. +Transfer plain markdown text to markdown tokens. ## License diff --git a/packages/parser/package.json b/packages/parser/package.json index 0445d89..bf7231d 100644 --- a/packages/parser/package.json +++ b/packages/parser/package.json @@ -20,6 +20,7 @@ "prepublishOnly": "nr build" }, "dependencies": { + "docx": "^7.3.0", "js-yaml": "^4.1.0" } } diff --git a/packages/parser/src/constants.ts b/packages/parser/src/constants.ts deleted file mode 100644 index 9a8ceb8..0000000 --- a/packages/parser/src/constants.ts +++ /dev/null @@ -1,9 +0,0 @@ -// MarkdownIt token types. -export const HEADING_OPEN = 'heading_open' -export const PARAGRAPH_OPEN = 'paragraph_open' - -// Style -export const KAI_TI_FIRA_CODE_FONTS = { - ascii: 'Fira Code', - eastAsia: 'KaiTi', -} diff --git a/packages/parser/src/core.ts b/packages/parser/src/core.ts deleted file mode 100644 index b21bec8..0000000 --- a/packages/parser/src/core.ts +++ /dev/null @@ -1,128 +0,0 @@ -import YAML = require('js-yaml') -import { isObject } from '@antfu/utils' -import MarkdownIt = require('markdown-it') -import type { ReportConfig, ReportMarkdown, ReportMarkdownParagraph, ReportMarkdownParagraphChild, ReportMarkdownSection } from '@md-report/types' -import Token = require('markdown-it/lib/token') -import type { IRunOptions } from 'docx' -import { HEADING_OPEN, PARAGRAPH_OPEN } from './constants' -import { getParagraphChildConfig, getParagraphChildType } from './utils' - -const md = MarkdownIt({ html: true }) - -export function matter(code: string): { data: ReportConfig; content: string } { - let data: any = {} - const content = code.replace(/^---.*\r?\n([\s\S]*?)---/, - (_, d) => { - data = YAML.load(d) - if (!isObject(data)) - data = {} - return '' - }) - return { data, content } -} - -export function parseParagraphChild(tokens: Token[]): ReportMarkdownParagraphChild { - // Get rid of closing tags. - let i = 0 - while (tokens[i].type.includes('close') || tokens[i].content.match(/\<\/[^]*?\>/)) - i++ - - const type = getParagraphChildType(tokens[i]) - const config: IRunOptions = getParagraphChildConfig(tokens.slice(i)) - - return { - type, - config, - } -} - -export function parseParagraph(tokens: Token[]): ReportMarkdownParagraph { - const type = tokens[0].type === HEADING_OPEN ? 'heading' : 'paragraph' - const level = tokens[0].markup.length - const _content = tokens[1].children - const children: ReportMarkdownParagraphChild[] = [] - - let start = 0 - for (let i = 0; i < _content.length; i++) { - const _token = _content[i] - if (_token.type === 'code_inline' || _token.type === 'text') { - children.push(parseParagraphChild(_content.slice(start, i + 1))) - start = i + 1 - } - } - - return { - type, - level, - children, - } -} - -export function parseSection(tokens: Token[]): ReportMarkdownSection { - const children: ReportMarkdownParagraph[] = [] - - let start = 0 - for (let i = 0; i < tokens.length; i++) { - const token = tokens[i] - if ((token.type === HEADING_OPEN || token.type === PARAGRAPH_OPEN) && i > start) { - children.push(parseParagraph(tokens.slice(start, i))) - start = i - } - } - children.push(parseParagraph(tokens.slice(start))) - - return { - type: 'section', - children, - } -} - -export function parseContent(tokens: Token[]): ReportMarkdownSection[] { - const sections: ReportMarkdownSection[] = [] - - let start = 0 - for (let i = 0; i < tokens.length; i++) { - const token = tokens[i] - // If heading 1. - if (token.type === HEADING_OPEN && token.markup.length === 1 && i > start) { - sections.push(parseSection(tokens.slice(start, i))) - start = i - } - } - sections.push(parseSection(tokens.slice(start))) - - return sections -} - -export function parse(markdown: string): ReportMarkdown { - const { data: frontmatter, content: rawContent } = matter(markdown) - const contentTokens = md.parse(rawContent, {}) - - const content = parseContent(contentTokens) - - return { - raw: markdown, - frontmatter, - content, - } -} - -const src = `# 111 - -this is a paragraph with **\`strong\`**, *italic*, \`inline code\`, [hyperlink](https://syy11.cn), ~~delete~~, ==highlight==, $1 + 1 = 2$, asupsub, ![image](https://image.cn), footnotes[^foot][^note] - -\`\`\`javascript -const a = 0 -\`\`\` - -$$ -1+2=3 -$$ - -# Refs - -[^foot]: ref1 -[^note]: ref2` - -// eslint-disable-next-line no-console -console.log(parse(src).content[0].children[1].children) diff --git a/packages/parser/src/fs.ts b/packages/parser/src/fs.ts deleted file mode 100644 index 85568fb..0000000 --- a/packages/parser/src/fs.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { promises as fs } from 'fs' -import type { ReportMarkdown } from '@md-report/types' -import { parse } from './core' - -export async function load(filepath: string, content?: string): Promise { - const markdown = content ?? await fs.readFile(filepath, 'utf8') - return parse(markdown) -} diff --git a/packages/parser/src/index.ts b/packages/parser/src/index.ts index e4137d3..b39a9e8 100644 --- a/packages/parser/src/index.ts +++ b/packages/parser/src/index.ts @@ -1,2 +1,49 @@ -export * from './core' -export * from './fs' +import type Token from 'markdown-it/lib/token' +import MarkdownIt from 'markdown-it' +import type { ISectionOptions, IStylesOptions, Paragraph, Table, TableOfContents } from 'docx' +import { Document } from 'docx' +import { sliceParagraph, sliceSection } from './utils' +import { paragraphParser } from './paragraph' + +const md = new MarkdownIt() + +export function parse(props: { markdown: string; config: { meta: Record; styles: IStylesOptions } }): Document { + const { markdown, config } = props + const { meta, styles } = config + // Get frontmatter. + // Get tokens. + const tokens: Token[] = md.parse(markdown, meta) + return parseDocument(tokens, styles) +} + +export function parseDocument(tokens: Token[], styles: IStylesOptions): Document { + // Variables. + let pos = 0 + const sections: ISectionOptions[] = [] + // Split and parse sections. + while (pos < tokens.length) { + const { tokens: section, offset: nextPos } = sliceSection(tokens.slice(pos)) + sections.push(parseSection(section)) + pos = nextPos + } + return new Document({ + styles, + sections, + }) +} + +export function parseSection(tokens: Token[]): ISectionOptions { + // Variables. + let pos = 0 + const children: (Paragraph | Table | TableOfContents)[] = [] + // Split and parse paragraphs. + while (pos < tokens.length) { + const { tokens: paragraph, offset: nextPos } = sliceParagraph(tokens.slice(pos)) + const parser = paragraphParser[tokens[0].tag] + children.push(parser(paragraph)) + pos = nextPos + } + return { + children, + } +} diff --git a/packages/parser/src/inline.ts b/packages/parser/src/inline.ts new file mode 100644 index 0000000..b53cac1 --- /dev/null +++ b/packages/parser/src/inline.ts @@ -0,0 +1,94 @@ +import { readFileSync } from 'fs' +import type { IImageOptions, IRunOptions, ParagraphChild } from 'docx' +import { ImageRun, Paragraph, TextRun } from 'docx' +import type Token from 'markdown-it/lib/token' +import { sliceInlineText } from './utils' + +export function parseInline(props: { tokens: Token[]; style?: string }): Paragraph { + // Variables. + const { tokens, style = 'normal' } = props + const { children: childrenTokens } = tokens[0] + const { length } = childrenTokens || [] + const children: ParagraphChild[] = [] + let pos = 0 + // Parse inline children. + while (pos < length) { + const { tokens: paragraphChild, offset: nextPos } = sliceInlineText(tokens.slice(pos)) + if (tokens[0].tag === 'img') + children.push(parseImage(paragraphChild)) + else + children.push(parseText(paragraphChild)) + pos = nextPos + } + return new Paragraph({ + style, + children, + }) +} + +export function parseText(tokens: Token[]): TextRun { + let options: IRunOptions = {} + tokens.forEach((token) => { + if (token.nesting >= 0) { + // Only deal with opening and text/code tokens. + switch (token.tag) { + // Bold. + case 'strong': + options = { ...options, bold: true } + break + // Italics + case 'em': + options = { ...options, italics: true } + break + // Subscript. + case 'sub': + options = { ...options, subScript: true } + break + // Superscript. + case 'sup': + options = { ...options, superScript: true } + break + // Strikethrough. + case 's': + options = { ...options, strike: true } + break + // Highlight. + case 'mark': + // TODO: Replace highlight color with env data. + options = { ...options, highlight: 'yellow' } + break + // Inline code. + case 'code': + // TODO: Replace code font with env data. + options = { ...options, font: {}, text: token.content } + break + // Normal text. + default: + options = { ...options, text: token.content } + } + } + }) + return new TextRun(options) +} + +export function parseImage(tokens: Token[]): ImageRun | TextRun { + const { attrGet, content } = tokens[0] + const src = attrGet('src') + if (!src) { + return new TextRun({ + text: `[MD Report]: Image ${content} is not found.`, + bold: true, + color: 'red', + highlight: 'yellow', + }) + } + const options: IImageOptions = { + data: readFileSync(src).toString('base64'), + // TODO: Replace width and height with config in image url. + transformation: { + width: 100, + height: 100, + }, + } + return new ImageRun(options) +} diff --git a/packages/parser/src/paragraph.ts b/packages/parser/src/paragraph.ts new file mode 100644 index 0000000..e14b7ab --- /dev/null +++ b/packages/parser/src/paragraph.ts @@ -0,0 +1,72 @@ +import type Token from 'markdown-it/lib/token' +import { Paragraph, Table, TableCell, TableRow } from 'docx' +import { sliceTableRow } from './utils' +import { parseInline } from './inline' + +export function parseFence(tokens: Token[]): Paragraph { + // Variables. + const { content: text } = tokens[0] + return new Paragraph({ + style: 'fence', + text, + }) +} + +export function parseTable(tokens: Token[]): Table { + // Variables + let pos = 0 + const rows: TableRow[] = [] + while (pos < tokens.length) { + const { tokens: tableRow, offset: nextPos } = sliceTableRow(tokens.slice(pos)) + rows.push(parseTableRow(tableRow)) + pos = nextPos + } + return new Table({ + style: 'table', + rows, + }) +} + +export function parseTableRow(tokens: Token[]): TableRow { + const cells: Token[] = tokens.filter(token => token.type === 'inline') + const children: TableCell[] = cells.map(cell => new TableCell({ + children: [parseInline({ + tokens: [cell], + style: 'table', + })], + })) + return new TableRow({ + children, + }) +} + +export function parseParagraph(tokens: Token[]): Paragraph { + const inline = tokens.filter(token => token.type === 'inline') + return parseInline({ + tokens: inline, + style: 'normal', + }) +} + +export function parseHeading(tokens: Token[]): Paragraph { + // Inline token. + const inline = tokens.filter(token => token.type === 'inline') + // Heading level. + const { length } = tokens[0].markup + return parseInline({ + tokens: inline, + style: `heading${length}`, + }) +} + +export const paragraphParser: Record (Paragraph|Table)> = { + code: parseFence, + table: parseTable, + p: parseParagraph, + h1: parseHeading, + h2: parseHeading, + h3: parseHeading, + h4: parseHeading, + h5: parseHeading, + h6: parseHeading, +} diff --git a/packages/parser/src/utils.ts b/packages/parser/src/utils.ts index 1d95b51..9bf1e6c 100644 --- a/packages/parser/src/utils.ts +++ b/packages/parser/src/utils.ts @@ -1,55 +1,53 @@ -import type { IRunOptions } from 'docx' -import type { MarkdownItTokenType } from '@md-report/types' -import Token = require('markdown-it/lib/token') -import { KAI_TI_FIRA_CODE_FONTS } from './constants' +import type Token from 'markdown-it/lib/token' -export function getParagraphChildType(token: Token): 'image' | 'text' { - switch (token.type) { - case 'image': - return 'image' - default: - return 'text' +export interface SliceResult { + tokens: Token[] + offset: number +} + +export function sliceSection(tokens: Token[]): SliceResult { + let offset = 0 + if (tokens[0].tag === 'h1') { + while (tokens[offset].nesting >= 0 || tokens[offset].tag !== 'h1') + offset++ + } + return { + tokens: tokens.slice(0, offset + 1), + offset: offset + 1, } } -export function getParagraphChildConfig(tokens: Token[]): IRunOptions { - let config: IRunOptions = {} +export function sliceParagraph(tokens: Token[]): SliceResult { + let offset = 0 + // Code block. + if (tokens[0].type !== 'fence') { + // Normal paragraphs. + while (tokens[offset].level > 0 || tokens[offset].nesting >= 0) + offset++ + } + // Return paragraph tokens. + return { + tokens: tokens.slice(0, offset + 1), + offset: offset + 1, + } +} - for (let i = 0; i < tokens.length; i++) { - const token = tokens[i] - switch (token.type as MarkdownItTokenType) { - case 'em_open': { - config = { ...config, italics: true } - break - } - case 'strong_open': { - config = { ...config, bold: true } - break - } - case 'mark_open': { - config = { ...config, shading: { fill: '#bbbbbb' }, style: 'mark' } - break - } - case 'html_inline': { - if (token.content === '') - config = { ...config, superScript: true } - if (token.content === '') - config = { ...config, subScript: true } - break - } - case 's_open': { - config = { ...config, strike: true } - break - } - case 'code_inline': { - config = { ...config, font: KAI_TI_FIRA_CODE_FONTS, style: 'code' } - break - } - case 'text': { - config = { ...config, text: token.content } - } +export function sliceTableRow(tokens: Token[]): SliceResult { + let offset = 0 + while (tokens[offset].type !== 'tr_open') + offset++ + return { + tokens: tokens.slice(0, offset), + offset, + } +} + +export function sliceInlineText(tokens: Token[]): SliceResult { + if (tokens[0].tag === 'img' || tokens[0].tag === 'code') { + return { + tokens: tokens.slice(0, 1), + offset: 1, } } - - return config + return sliceParagraph(tokens) }