Import Upstream version 2.1.0

This commit is contained in:
su-fang 2023-03-17 15:15:55 +08:00
commit 6cb7cfa55b
154 changed files with 53184 additions and 0 deletions

48
.github/workflows/benchmark.yml vendored Normal file
View File

@ -0,0 +1,48 @@
# Weekly/on-push benchmark of markdown parsers; results are pushed to the
# gh-pages benchmark dashboard via github-action-benchmark.
name: benchmark

on:
  push:
    branches: [master]

jobs:
  benchmark-packages:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          # quoted: an unquoted 3.8 is a YAML float (and e.g. 3.10 would
          # silently become 3.1); also consistent with tests.yml
          python-version: '3.8'
      - name: install pandoc
        uses: r-lib/actions/setup-pandoc@v1
        with:
          pandoc-version: '2.6'
      - name: Install tox
        run: |
          python -m pip install --upgrade pip
          pip install tox
      - name: Run package benchmarks
        run: tox -e py38-bench-packages -- --benchmark-min-rounds 20 --benchmark-json bench-packages.json
      # - name: Upload package data
      #   uses: actions/upload-artifact@v2
      #   with:
      #     name: bench-packages
      #     path: bench-packages.json
      #     if-no-files-found: error
      - name: Store benchmark result
        uses: aiidateam/github-action-benchmark@v2
        with:
          name: Parsing Benchmarks
          output-file-path: bench-packages.json
          github-token: ${{ secrets.GITHUB_TOKEN }}
          auto-push: true
          commit-msg-append: "[ci skip]"
          one-chart-groups: packages,plugins
          fail-on-alert: false

136
.github/workflows/tests.yml vendored Normal file
View File

@ -0,0 +1,136 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: continuous-integration

on:
  push:
    branches: [master]
    tags:
      - "v[0-9]+.[0-9]+.[0-9]+*"
  pull_request:
  schedule:
    - cron: '0 0 * * 0'  # every week

jobs:

  # static analysis / formatting checks via pre-commit hooks
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - uses: pre-commit/action@v2.0.0

  # unit tests across CPython and PyPy, with coverage upload from one job
  tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['pypy-3.7', '3.7', '3.8', '3.9', '3.10']
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install .[testing,linkify]
      - name: Run pytest
        run: |
          pytest tests/ --cov=markdown_it --cov-report=xml --cov-report=term-missing
      - name: Upload to Codecov
        # only one matrix entry uploads coverage, and only on the main repo
        if: matrix.python-version == '3.7' && github.repository == 'executablebooks/markdown-it-py'
        uses: codecov/codecov-action@v1
        with:
          name: markdown-it-py-pytests-py3.7
          flags: pytests
          file: ./coverage.xml
          fail_ci_if_error: true

  # smoke-test the downstream mdit-py-plugins package against this commit
  test-plugins:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.8']
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install markdown-it-py
        run: |
          python -m pip install --upgrade pip
          pip install .[testing]
      - name: clone and install mdit-py-plugins
        run: |
          git clone https://github.com/executablebooks/mdit-py-plugins.git
          pip install --no-deps -e mdit-py-plugins
      - name: Run pytest for unit tests of mdit-py-plugins
        run: cd mdit-py-plugins; pytest

  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: Install tox
        run: |
          python -m pip install --upgrade pip
          pip install tox
      - name: Run benchmark
        run: tox -e py38-bench-core -- --benchmark-json bench-core.json
      - name: Upload data
        uses: actions/upload-artifact@v2
        with:
          name: bench-core
          path: bench-core.json
          if-no-files-found: error

  publish:
    name: Publish to PyPi
    needs: [pre-commit, tests]
    # only publish on version tags
    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
    runs-on: ubuntu-latest
    steps:
      - name: Checkout source
        uses: actions/checkout@v2
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: install flit
        run: |
          pip install flit~=3.4
      - name: Build and publish
        run: |
          flit publish
        env:
          FLIT_USERNAME: __token__
          FLIT_PASSWORD: ${{ secrets.PYPI_KEY }}

  # single required status check aggregating the gating jobs
  allgood:
    runs-on: ubuntu-latest
    needs:
      - pre-commit
      - tests
    steps:
      - run: echo "Great success!"

143
.gitignore vendored Normal file
View File

@ -0,0 +1,143 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
benchmark/extra/
node_modules/
coverage/
demo/
apidoc/
*.log
__pycache__/
.ropeproject/
*.egg-info/
.vscode/
.DS_Store
docs/api/

46
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,46 @@
# Install pre-commit hooks via
# pre-commit install

# verbose regex (?x) of paths exempt from the hooks: test fixtures,
# CommonMark spec/sample files, and editor settings
exclude: >
  (?x)^(
    \.vscode/settings\.json|
    test.*\.md|
    test.*\.txt|
    test.*\.html|
    test.*\.xml|
    .*commonmark\.json|
    benchmark/.*\.md|
    .*/spec\.md
  )$

repos:

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.2.0
    hooks:
      - id: check-json
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace

  - repo: https://github.com/pycqa/isort
    rev: 5.10.1
    hooks:
      - id: isort

  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black

  - repo: https://gitlab.com/pycqa/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
        additional_dependencies: [flake8-bugbear==21.3.1]

  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.942
    hooks:
      - id: mypy
        # attrs is needed so mypy can resolve the (optional) attrs plugin types
        additional_dependencies: [attrs]

15
.readthedocs.yml Normal file
View File

@ -0,0 +1,15 @@
# Read the Docs build configuration (v2 schema)
version: 2

python:
  version: 3
  install:
    - method: pip
      path: .
      # extras from setup.cfg/pyproject needed to build the docs
      extra_requirements:
        - linkify
        - plugins
        - rtd

sphinx:
  builder: html
  # treat Sphinx warnings as build failures
  fail_on_warning: true

266
CHANGELOG.md Normal file
View File

@ -0,0 +1,266 @@
# Change Log
## 2.1.0 - 2022-04-15
This release is primarily to replace the `attrs` package dependency,
with the built-in Python `dataclasses` package.
This should not be a breaking change, for most use cases.
- ⬆️ UPGRADE: Drop support for EOL Python 3.6 (#194)
- ♻️ REFACTOR: Move `Rule`/`Delimiter` classes from `attrs` to `dataclass` (#211)
- ♻️ REFACTOR: Move `Token` class from `attrs` to `dataclass` (#211)
- ‼️ Remove deprecated `NestedTokens` and `nest_tokens`
- ✨ NEW: Save ordered list numbering (#192)
- 🐛 FIX: Combination of blockquotes, list and newlines causes `IndexError` (#207)
## 2.0.1 - 2022-01-24
- 🐛 FIX: Crash when file ends with empty blockquote line.
- ✨ NEW: Add `inline_definitions` option.
This option allows for `definition` token to be inserted into the token stream, at the point where the definition is located in the source text.
It is useful for cases where one wishes to capture a "lossless" syntax tree of the parsed Markdown (in conjunction with the `store_labels` option).
## 2.0.0 - 2021-12-03
- ⬆️ Update: Sync with markdown-it v12.1.0 and CommonMark v0.30
- ♻️ REFACTOR: Port `mdurl` and `punycode` for URL normalisation (thanks to @hukkin!).
This port fixes the outstanding CommonMark compliance tests.
- ♻️ REFACTOR: Remove `AttrDict`.
This is no longer used in core or mdit-py-plugins, instead standard dictionaries are used.
- 👌 IMPROVE: Use `__all__` to signal re-exports
## 1.1.0 - 2021-05-08
⬆️ UPGRADE: `attrs` -> v21 (#165)
This release has no breaking changes
(see: <https://github.com/python-attrs/attrs/blob/main/CHANGELOG.rst>)
## 1.0.0 - 2021-05-02
[Full commit log](https://github.com/executablebooks/markdown-it-py/compare/v0.6.2...v1.0.0)
The first stable release of markdown-it-py 🎉
See the changes in the beta releases below,
thanks to all the [contributors](https://github.com/executablebooks/markdown-it-py/graphs/contributors?from=2020-03-22&to=2021-05-02&type=c) in the last year!
## 1.0.0b3 - 2021-05-01
- 👌 IMPROVE: Add `RendererProtocol` type, for typing renderers (thanks to [@hukkinj1](https://github.com/hukkinj1))
- 🔧 MAINTAIN: `None` is no longer allowed as a valid `src` input for `StateBase` subclasses
## 1.0.0b2 - 2021-04-25
‼️ BREAKING: Move `mdit-py-plugins` out of the core install requirements and into a `plugins` extra.
Synchronised code with the upstream Markdown-It `v12.0.6`:
- 🐛 FIX: Raise HTML blocks priority to resolve conflict with headings
- 🐛 FIX: Newline not rendered in image alt attribute
## 1.0.0b1 - 2021-03-31
[Full commit log](https://github.com/executablebooks/markdown-it-py/compare/v0.6.2...9ecda04)
This is the first beta release of the stable v1.x series.
There are four notable (and breaking) changes:
1. The code has been synchronised with the upstream Markdown-It `v12.0.4`.
In particular, this update alters the parsing of tables to be consistent with the GFM specification: <https://github.github.com/gfm/#tables-extension->
A number of parsing performance and validation improvements are also included.
2. `Token.attrs` are now stored as dictionaries, rather than a list of lists.
This is a departure from upstream Markdown-It, allowed by Python's guarantee of ordered dictionaries (see [#142](https://github.com/markdown-it/markdown-it/issues/142)), and is the more natural representation.
Note `attrGet`, `attrSet`, `attrPush` and `attrJoin` methods remain identical to those upstream,
and `Token.as_dict(as_upstream=True)` will convert the token back to a directly comparable dict.
3. The use of `AttrDict` has been replaced:
For `env` any Python mutable mapping is now allowed, and so attribute access to keys is not (differing from the Javascript dictionary).
For `MarkdownIt.options` it is now set as an `OptionsDict`, which is a dictionary sub-class, with attribute access only for core MarkdownIt configuration keys.
4. Introduction of the `SyntaxTreeNode`.
This is a more comprehensive replacement for `nest_tokens` and `NestedTokens` (which are now deprecated).
It allows for the `Token` stream to be converted to/from a nested tree structure, with opening/closing tokens collapsed into a single `SyntaxTreeNode` and the intermediate tokens set as children.
See [Creating a syntax tree](https://markdown-it-py.readthedocs.io/en/latest/using.html#creating-a-syntax-tree) documentation for details.
### Additional Fixes 🐛
- Fix exception due to empty lines after blockquote+footnote
- Fix linkify link nesting levels
- Fix the use of `Ruler.at` for plugins
- Avoid fenced token mutations during rendering
- Fix CLI version info and correct return of exit codes
## 0.6.2 - 2021-02-07
This release brings Markdown-It-Py inline with Markdown-It v11.0.1 (2020-09-14), applying two fixes:
- Fix blockquote lazy newlines, [[#696](https://github.com/markdown-it/markdown-it/issues/696)].
- Fix missed mappings for table rows, [[#705](https://github.com/markdown-it/markdown-it/issues/705)].
Thanks to [@hukkinj1](https://github.com/hukkinj1)!
## 0.6.1 - 2021-01-01
This release provides some improvements to the code base:
- 🐛 FIX: Do not resolve backslash escapes inside auto-links
- 🐛 FIX: Add content to image tokens
- 👌 IMPROVE: Add more type annotations, thanks to [@hukkinj1](https://github.com/hukkinj1)
## 0.6.0 - 2020-12-15
🗑 DEPRECATE: Move plugins to `mdit_py_plugins`
Plugins (in `markdown_it.extensions`) have now been moved to [executablebooks/mdit-py-plugins](https://github.com/executablebooks/mdit-py-plugins).
This will allow for their maintenance to occur on a different cycle to the core code, facilitating the release of a v1.0.0 for this package
🔧 MAINTAIN: Add [mypy](https://mypy.readthedocs.io) type-checking, thanks to [@hukkinj1](https://github.com/hukkinj1).
## 0.5.8 - 2020-12-13
✨ NEW: Add linkify, thanks to [@tsutsu3](https://github.com/tsutsu3).
This extension uses [linkify-it-py](https://github.com/tsutsu3/linkify-it-py) to identify URL links within text:
- `github.com` -> `<a href="http://github.com">github.com</a>`
**Important:** To use this extension you must install linkify-it-py; `pip install markdown-it-py[linkify]`
It can then be activated by:
```python
from markdown_it import MarkdownIt
md = MarkdownIt().enable("linkify")
md.options["linkify"] = True
```
## 0.5.7 - 2020-12-13
✨ NEW: Add smartquotes, thanks to [@tsutsu3](https://github.com/tsutsu3).
This extension will convert basic quote marks to their opening and closing variants:
- 'single quotes' -> ‘single quotes’
- "double quotes" -> “double quotes”
It can be activated by:
```python
from markdown_it import MarkdownIt
md = MarkdownIt().enable("smartquotes")
md.options["typographer"] = True
```
✨ NEW: Add markdown-it-task-lists plugin, thanks to [@wna-se](https://github.com/wna-se).
This is a port of the JS [markdown-it-task-lists](https://github.com/revin/markdown-it-task-lists),
for building task/todo lists out of markdown lists with items starting with `[ ]` or `[x]`.
For example:
```markdown
- [ ] An item that needs doing
- [x] An item that is complete
```
This plugin can be activated by:
```python
from markdown_it import MarkdownIt
from markdown_it.extensions.tasklists import tasklists_plugin
md = MarkdownIt().use(tasklists_plugin)
```
🐛 Various bug fixes, thanks to [@hukkinj1](https://github.com/hukkinj1):
- Do not copy empty `env` arg in `MarkdownIt.render`
- `_Entities.__contains__` fix return data
- Parsing of unicode ordinals
- Handling of final character in `skipSpacesBack` and `skipCharsBack` methods
- Avoid exception when document ends in heading/blockquote marker
🧪 TESTS: Add CI for Python 3.9 and PyPy3
## 0.5.6 - 2020-10-21
- ✨ NEW: Add simple typographic replacements, thanks to [@tsutsu3](https://github.com/tsutsu3):
This allows you to add the `typographer` option to the parser, to replace particular text constructs:
- ``(c)``, ``(C)`` → ©
- ``(tm)``, ``(TM)`` → ™
- ``(r)``, ``(R)`` → ®
- ``(p)``, ``(P)`` → §
- ``+-`` → ±
- ``...`` → …
- ``?....`` → ?..
- ``!....`` → !..
- ``????????`` → ???
- ``!!!!!`` → !!!
- ``,,,`` → ,
- ``--`` → –
- ``---`` → —
```python
md = MarkdownIt().enable("replacements")
md.options["typographer"] = True
```
- 📚 DOCS: Improve documentation for CLI, thanks to [@westurner](https://github.com/westurner)
- 👌 IMPROVE: Use `re.sub()` instead of `re.subn()[0]`, thanks to [@hukkinj1](https://github.com/hukkinj1)
- 🐛 FIX: An exception raised by having multiple blank lines at the end of some files
## 0.5.5 - 2020-09-27
👌 IMPROVE: Add `store_labels` option.
This allows for storage of original reference label in link/image token's metadata,
which can be useful for renderers.
## 0.5.4 - 2020-09-08
✨ NEW: Add `anchors_plugin` for headers, which can produce:
```html
<h1 id="title-string">Title String <a class="header-anchor" href="#title-string"></a></h1>
```
## 0.5.3 - 2020-09-04
🐛 Fixed an undefined variable in the reference block.
## 0.5.2 - 2020-08-22
🐛 Fixed an `IndexError` in `container_plugin`, when there is no newline on the closing tag line.
## 0.5.1 - 2020-08-21
⬆️ UPGRADE: attrs -> v20
This is not breaking, since it only deprecates Python 3.4 (see [CHANGELOG.rst](https://github.com/python-attrs/attrs/blob/master/CHANGELOG.rst))
## 0.5.0 - 2020-08-18
### Added ✨
- `deflist` and `dollarmath` plugins (see [plugins list](https://markdown-it-py.readthedocs.io/en/latest/plugins.html)).
### Improved 👌
- Added benchmarking tests and CI (see <https://executablebooks.github.io/markdown-it-py/dev/bench/>)
- Improved performance of computing ordinals (=> 10-15% parsing speed increase).
Thanks to [@sildar](https://github.com/sildar)!
### Fixed 🐛
- Stopped empty lines at the end of the document, after certain list blocks, raising an exception (#36).
- Allow myst-role to accept names containing digits (0-9).
## 0.4.9 - 2020-08-11
### Added ✨
- `containers` plugin (see [plugins list](https://markdown-it-py.readthedocs.io/en/latest/plugins.html))
### Documented 📚
- Plugins and improved contributing section

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2020 ExecutableBookProject
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

22
LICENSE.markdown-it Normal file
View File

@ -0,0 +1,22 @@
Copyright (c) 2014 Vitaly Puzrin, Alex Kocharin.
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

148
README.md Normal file
View File

@ -0,0 +1,148 @@
# markdown-it-py
[![Github-CI][github-ci]][github-link]
[![Coverage Status][codecov-badge]][codecov-link]
[![PyPI][pypi-badge]][pypi-link]
[![Conda][conda-badge]][conda-link]
[![Code style: black][black-badge]][black-link]
[![PyPI - Downloads][install-badge]][install-link]
> Markdown parser done right.
- Follows the __[CommonMark spec](http://spec.commonmark.org/)__ for baseline parsing
- Configurable syntax: you can add new rules and even replace existing ones.
- Pluggable: Adds syntax extensions to extend the parser (see the [plugin list][md-plugins]).
- High speed (see our [benchmarking tests][md-performance])
- [Safe by default][md-security]
This is a Python port of [markdown-it], and some of its associated plugins.
For more details see: <https://markdown-it-py.readthedocs.io>.
For details on [markdown-it] itself, see:
- The __[Live demo](https://markdown-it.github.io)__
- [The markdown-it README][markdown-it-readme]
## Installation
```bash
conda install -c conda-forge markdown-it-py
```
or
```bash
pip install markdown-it-py[plugins]
```
or with extras
```bash
conda install -c conda-forge markdown-it-py linkify-it-py mdit-py-plugins
pip install markdown-it-py[linkify,plugins]
```
## Usage
### Python API Usage
Render markdown to HTML with markdown-it-py and a custom configuration
with and without plugins and features:
```python
from markdown_it import MarkdownIt
from mdit_py_plugins.front_matter import front_matter_plugin
from mdit_py_plugins.footnote import footnote_plugin
md = (
MarkdownIt()
.use(front_matter_plugin)
.use(footnote_plugin)
.disable('image')
.enable('table')
)
text = ("""
---
a: 1
---
a | b
- | -
1 | 2
A footnote [^1]
[^1]: some details
""")
tokens = md.parse(text)
html_text = md.render(text)
## To export the html to a file, uncomment the lines below:
# from pathlib import Path
# Path("output.html").write_text(html_text)
```
### Command-line Usage
Render markdown to HTML with markdown-it-py from the
command-line:
```console
usage: markdown-it [-h] [-v] [filenames [filenames ...]]
Parse one or more markdown files, convert each to HTML, and print to stdout
positional arguments:
filenames specify an optional list of files to convert
optional arguments:
-h, --help show this help message and exit
-v, --version show program's version number and exit
Interactive:
$ markdown-it
markdown-it-py [version 0.0.0] (interactive)
Type Ctrl-D to complete input, or Ctrl-C to exit.
>>> # Example
... > markdown *input*
...
<h1>Example</h1>
<blockquote>
<p>markdown <em>input</em></p>
</blockquote>
Batch:
$ markdown-it README.md README.footer.md > index.html
```
## References / Thanks
Big thanks to the authors of [markdown-it]:
- Alex Kocharin [github/rlidwka](https://github.com/rlidwka)
- Vitaly Puzrin [github/puzrin](https://github.com/puzrin)
Also [John MacFarlane](https://github.com/jgm) for his work on the CommonMark spec and reference implementations.
[github-ci]: https://github.com/executablebooks/markdown-it-py/workflows/Python%20package/badge.svg?branch=master
[github-link]: https://github.com/executablebooks/markdown-it-py
[pypi-badge]: https://img.shields.io/pypi/v/markdown-it-py.svg
[pypi-link]: https://pypi.org/project/markdown-it-py
[conda-badge]: https://anaconda.org/conda-forge/markdown-it-py/badges/version.svg
[conda-link]: https://anaconda.org/conda-forge/markdown-it-py
[codecov-badge]: https://codecov.io/gh/executablebooks/markdown-it-py/branch/master/graph/badge.svg
[codecov-link]: https://codecov.io/gh/executablebooks/markdown-it-py
[black-badge]: https://img.shields.io/badge/code%20style-black-000000.svg
[black-link]: https://github.com/ambv/black
[install-badge]: https://img.shields.io/pypi/dw/markdown-it-py?label=pypi%20installs
[install-link]: https://pypistats.org/packages/markdown-it-py
[CommonMark spec]: http://spec.commonmark.org/
[markdown-it]: https://github.com/markdown-it/markdown-it
[markdown-it-readme]: https://github.com/markdown-it/markdown-it/blob/master/README.md
[md-security]: https://markdown-it-py.readthedocs.io/en/latest/other.html
[md-performance]: https://markdown-it-py.readthedocs.io/en/latest/other.html
[md-plugins]: https://markdown-it-py.readthedocs.io/en/latest/plugins.html

View File

@ -0,0 +1,20 @@
from pathlib import Path

import pytest

import markdown_it


@pytest.fixture
def spec_text():
    """Return the bundled CommonMark spec document used as benchmark input."""
    return Path(__file__).parent.joinpath("samples", "spec.md").read_text()


@pytest.fixture
def parser():
    """Return a MarkdownIt parser configured for CommonMark compliance."""
    return markdown_it.MarkdownIt("commonmark")


@pytest.mark.benchmark(group="core")
def test_spec(benchmark, parser, spec_text):
    """Benchmark rendering the full spec document with markdown-it-py."""
    benchmark(parser.render, spec_text)

View File

@ -0,0 +1,69 @@
from pathlib import Path
from shutil import which

import pytest


@pytest.fixture
def spec_text():
    """Return the bundled CommonMark spec document used as benchmark input."""
    return Path(__file__).parent.joinpath("samples", "spec.md").read_text()


@pytest.mark.benchmark(group="packages")
def test_markdown_it_py(benchmark, spec_text):
    """Benchmark markdown-it-py rendering of the spec document."""
    import markdown_it

    parser = markdown_it.MarkdownIt("commonmark")
    benchmark.extra_info["version"] = markdown_it.__version__
    benchmark(parser.render, spec_text)


@pytest.mark.benchmark(group="packages")
def test_mistune(benchmark, spec_text):
    """Benchmark the mistune package on the same input."""
    import mistune

    benchmark.extra_info["version"] = mistune.__version__
    benchmark(mistune.markdown, spec_text)


@pytest.mark.benchmark(group="packages")
def test_commonmark_py(benchmark, spec_text):
    """Benchmark the commonmark.py package on the same input."""
    import commonmark

    # commonmark does not expose __version__; record the pinned release
    benchmark.extra_info["version"] = "0.9.1"
    benchmark(commonmark.commonmark, spec_text)


@pytest.mark.benchmark(group="packages")
def test_pymarkdown(benchmark, spec_text):
    """Benchmark Python-Markdown (default configuration)."""
    import markdown

    benchmark.extra_info["version"] = markdown.__version__
    benchmark(markdown.markdown, spec_text)


@pytest.mark.benchmark(group="packages")
def test_pymarkdown_extra(benchmark, spec_text):
    """Benchmark Python-Markdown with the 'extra' extension bundle enabled."""
    import markdown

    benchmark.extra_info["version"] = markdown.__version__
    benchmark(markdown.markdown, spec_text, extensions=["extra"])


@pytest.mark.benchmark(group="packages")
def test_mistletoe(benchmark, spec_text):
    """Benchmark the mistletoe package on the same input."""
    import mistletoe

    benchmark.extra_info["version"] = mistletoe.__version__
    benchmark(mistletoe.markdown, spec_text)


@pytest.mark.skipif(which("pandoc") is None, reason="pandoc executable not found")
@pytest.mark.benchmark(group="packages")
def test_panflute(benchmark, spec_text):
    """Benchmark panflute (pandoc wrapper); skipped when pandoc is absent."""
    import panflute

    benchmark.extra_info["version"] = panflute.__version__
    benchmark(
        panflute.convert_text, spec_text, input_format="markdown", output_format="html"
    )

12
benchmarking/conftest.py Normal file
View File

@ -0,0 +1,12 @@
def pytest_benchmark_update_machine_info(config, machine_info):
    """pytest-benchmark hook: attach CPU details to the stored machine info.

    Records physical/logical core counts, current CPU utilisation and
    frequency range, so benchmark results can be compared across machines.
    """
    # imported lazily so collection works even without psutil installed,
    # as long as benchmarks are not actually run
    import psutil

    freq = psutil.cpu_freq()
    machine_info["psutil"] = {
        "cpu_count": psutil.cpu_count(logical=False),
        "cpu_count_logical": psutil.cpu_count(logical=True),
        "cpu_percent": psutil.cpu_percent(),
        "cpu_freq_min": freq.min,
        "cpu_freq_max": freq.max,
        "cpu_freq_current": freq.current,
    }

View File

@ -0,0 +1,15 @@
> the simple example of a blockquote
> the simple example of a blockquote
> the simple example of a blockquote
> the simple example of a blockquote
... continuation
... continuation
... continuation
... continuation
empty blockquote:
>
>
>
>

View File

@ -0,0 +1,13 @@
>>>>>> deeply nested blockquote
>>>>> deeply nested blockquote
>>>> deeply nested blockquote
>>> deeply nested blockquote
>> deeply nested blockquote
> deeply nested blockquote
> deeply nested blockquote
>> deeply nested blockquote
>>> deeply nested blockquote
>>>> deeply nested blockquote
>>>>> deeply nested blockquote
>>>>>> deeply nested blockquote

View File

@ -0,0 +1,10 @@
an
example
of
a code
block

View File

@ -0,0 +1,13 @@
``````````text
an
example
```
of
a fenced
```
code
block
``````````

View File

@ -0,0 +1,9 @@
# heading
### heading
##### heading
# heading #
### heading ###
##### heading \#\#\#\#\######
############ not a heading

View File

@ -0,0 +1,9 @@
* * * * *
- - - - -
________
************************* text

View File

@ -0,0 +1,31 @@
<div class="this is an html block">
blah blah
</div>
<table>
<tr>
<td>
**test**
</td>
</tr>
</table>
<table>
<tr>
<td>
test
</td>
</tr>
</table>
<![CDATA[
[[[[[[[[[[[... *cdata section - this should not be parsed* ...]]]]]]]]]]]
]]>

View File

@ -0,0 +1,8 @@
heading
---
heading
===================================
not a heading
----------------------------------- text

View File

@ -0,0 +1,67 @@
- tidy
- bullet
- list
- loose
- bullet
- list
0. ordered
1. list
2. example
-
-
-
-
1.
2.
3.
- an example
of a list item
with a continuation
this part is inside the list
this part is just a paragraph
1. test
- test
1. test
- test
111111111111111111111111111111111111111111. is this a valid bullet?
- _________________________
- this
- is
a
long
- loose
- list
- with
- some
tidy
- list
- items
- in
- between
- _________________________

View File

@ -0,0 +1,35 @@
- this
- is
- a
- deeply
- nested
- bullet
- list
1. this
2. is
3. a
4. deeply
5. nested
6. unordered
7. list
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 6
- 5
- 4
- 3
- 2
- 1
- - - - - - - - - deeply-nested one-element item

View File

@ -0,0 +1,15 @@
[1] [2] [3] [1] [2] [3]
[looooooooooooooooooooooooooooooooooooooooooooooooooong label]
[1]: <http://something.example.com/foo/bar>
[2]: http://something.example.com/foo/bar 'test'
[3]:
http://foo/bar
[ looooooooooooooooooooooooooooooooooooooooooooooooooong label ]:
111
'test'
[[[[[[[[[[[[[[[[[[[[ this should not slow down anything ]]]]]]]]]]]]]]]]]]]]: q
(as long as it is not referenced anywhere)
[[[[[[[[[[[[[[[[[[[[]: this is not a valid reference

View File

@ -0,0 +1,50 @@
[item 1]: <1>
[item 2]: <2>
[item 3]: <3>
[item 4]: <4>
[item 5]: <5>
[item 6]: <6>
[item 7]: <7>
[item 8]: <8>
[item 9]: <9>
[item 10]: <10>
[item 11]: <11>
[item 12]: <12>
[item 13]: <13>
[item 14]: <14>
[item 15]: <15>
[item 16]: <16>
[item 17]: <17>
[item 18]: <18>
[item 19]: <19>
[item 20]: <20>
[item 21]: <21>
[item 22]: <22>
[item 23]: <23>
[item 24]: <24>
[item 25]: <25>
[item 26]: <26>
[item 27]: <27>
[item 28]: <28>
[item 29]: <29>
[item 30]: <30>
[item 31]: <31>
[item 32]: <32>
[item 33]: <33>
[item 34]: <34>
[item 35]: <35>
[item 36]: <36>
[item 37]: <37>
[item 38]: <38>
[item 39]: <39>
[item 40]: <40>
[item 41]: <41>
[item 42]: <42>
[item 43]: <43>
[item 44]: <44>
[item 45]: <45>
[item 46]: <46>
[item 47]: <47>
[item 48]: <48>
[item 49]: <49>
[item 50]: <50>

View File

@ -0,0 +1,17 @@
[[[[[[[foo]]]]]]]
[[[[[[[foo]]]]]]]: bar
[[[[[[foo]]]]]]: bar
[[[[[foo]]]]]: bar
[[[[foo]]]]: bar
[[[foo]]]: bar
[[foo]]: bar
[foo]: bar
[*[*[*[*[foo]*]*]*]*]
[*[*[*[*[foo]*]*]*]*]: bar
[*[*[*[foo]*]*]*]: bar
[*[*[foo]*]*]: bar
[*[foo]*]: bar
[foo]: bar

View File

@ -0,0 +1,21 @@
| Heading 1 | Heading 2
| --------- | ---------
| Cell 1 | Cell 2
| Cell 3 | Cell 4
| Header 1 | Header 2 | Header 3 | Header 4 |
| :------: | -------: | :------- | -------- |
| Cell 1 | Cell 2 | Cell 3 | Cell 4 |
| Cell 5 | Cell 6 | Cell 7 | Cell 8 |
Test code
Header 1 | Header 2
-------- | --------
Cell 1 | Cell 2
Cell 3 | Cell 4
Header 1|Header 2|Header 3|Header 4
:-------|:------:|-------:|--------
Cell 1 |Cell 2 |Cell 3 |Cell 4
*Cell 5*|Cell 6 |Cell 7 |Cell 8

View File

@ -0,0 +1,14 @@
closed (valid) autolinks:
<ftp://1.2.3.4:21/path/foo>
<http://foo.bar.baz?q=hello&id=22&boolean>
<http://veeeeeeeeeeeeeeeeeeery.loooooooooooooooooooooooooooooooong.autolink/>
<teeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeest@gmail.com>
these are not autolinks:
<ftp://1.2.3.4:21/path/foo
<http://foo.bar.baz?q=hello&id=22&boolean
<http://veeeeeeeeeeeeeeeeeeery.loooooooooooooooooooooooooooooooong.autolink
<teeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeest@gmail.com
< http://foo.bar.baz?q=hello&id=22&boolean >

View File

@ -0,0 +1,3 @@
`lots`of`backticks`
``i``wonder``how``this``will``be``parsed``

View File

@ -0,0 +1,5 @@
*this* *is* *your* *basic* *boring* *emphasis*
_this_ _is_ _your_ _basic_ _boring_ _emphasis_
**this** **is** **your** **basic** **boring** **emphasis**

View File

@ -0,0 +1,5 @@
*this *is *a *bunch* of* nested* emphases*
__this __is __a __bunch__ of__ nested__ emphases__
***this ***is ***a ***bunch*** of*** nested*** emphases***

View File

@ -0,0 +1,5 @@
*this *is *a *worst *case *for *em *backtracking
__this __is __a __worst __case __for __em __backtracking
***this ***is ***a ***worst ***case ***for ***em ***backtracking

View File

@ -0,0 +1,11 @@
entities:
&nbsp; &amp; &copy; &AElig; &Dcaron; &frac34; &HilbertSpace; &DifferentialD; &ClockwiseContourIntegral;
&#35; &#1234; &#992; &#98765432;
non-entities:
&18900987654321234567890; &1234567890098765432123456789009876543212345678987654;
&qwertyuioppoiuytrewqwer; &oiuytrewqwertyuioiuytrewqwertyuioytrewqwertyuiiuytri;

View File

@ -0,0 +1,14 @@
\t\e\s\t\i\n\g \e\s\c\a\p\e \s\e\q\u\e\n\c\e\s
\!\\\"\#\$\%\&\'\(\)\*\+\,\.\/\:\;\<\=\>\?
\@ \[ \] \^ \_ \` \{ \| \} \~ \- \'
\
\\
\\\
\\\\
\\\\\
\<this\> \<is\> \<not\> \<html\>

View File

@ -0,0 +1,44 @@
Taking commonmark tests from the spec for benchmarking here:
<a><bab><c2c>
<a/><b2/>
<a /><b2
data="foo" >
<a foo="bar" bam = 'baz <em>"</em>'
_boolean zoop:33=zoop:33 />
<33> <__>
<a h*#ref="hi">
<a href="hi'> <a href=hi'>
< a><
foo><bar/ >
<a href='bar'title=title>
</a>
</foo >
</a href="foo">
foo <!-- this is a
comment - with hyphen -->
foo <!-- not a comment -- two hyphens -->
foo <?php echo $a; ?>
foo <!ELEMENT br EMPTY>
foo <![CDATA[>&<]]>
<a href="&ouml;">
<a href="\*">
<a href="\"">

View File

@ -0,0 +1,23 @@
Valid links:
[this is a link]()
[this is a link](<http://something.example.com/foo/bar>)
[this is a link](http://something.example.com/foo/bar 'test')
![this is an image]()
![this is an image](<http://something.example.com/foo/bar>)
![this is an image](http://something.example.com/foo/bar 'test')
[escape test](<\>\>\>\>\>\>\>\>\>\>\>\>\>\>> '\'\'\'\'\'\'\'\'\'\'\'\'\'\'')
[escape test \]\]\]\]\]\]\]\]\]\]\]\]\]\]\]\]](\)\)\)\)\)\)\)\)\)\)\)\)\)\))
Invalid links:
[this is not a link
[this is not a link](
[this is not a link](http://something.example.com/foo/bar 'test'
[this is not a link](((((((((((((((((((((((((((((((((((((((((((((((
[this is not a link]((((((((((()))))))))) (((((((((()))))))))))

View File

@ -0,0 +1,13 @@
Valid links:
[[[[[[[[](test)](test)](test)](test)](test)](test)](test)]
[ [[[[[[[[[[[[[[[[[[ [](test) ]]]]]]]]]]]]]]]]]] ](test)
Invalid links:
[[[[[[[[[
[ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [ [
![![![![![![![![![![![![![![![![![![![![![![![![![![![![![![![![![![![![![![

View File

@ -0,0 +1,23 @@
this\
should\
be\
separated\
by\
newlines
this
should
be
separated
by
newlines
too
this
should
not
be
separated
by
newlines

View File

@ -0,0 +1,13 @@
Lorem ipsum dolor sit amet, __consectetur__ adipiscing elit. Cras imperdiet nec erat ac condimentum. Nulla vel rutrum ligula. Sed hendrerit interdum orci a posuere. Vivamus ut velit aliquet, mollis purus eget, iaculis nisl. Proin posuere malesuada ante. Proin auctor orci eros, ac molestie lorem dictum nec. Vestibulum sit amet erat est. Morbi luctus sed elit ac luctus. Proin blandit, enim vitae egestas posuere, neque elit ultricies dui, vel mattis nibh enim ac lorem. Maecenas molestie nisl sit amet velit dictum lobortis. Aliquam erat volutpat.
Vivamus sagittis, diam in [vehicula](https://github.com/markdown-it/markdown-it) lobortis, sapien arcu mattis erat, vel aliquet sem urna et risus. Ut feugiat sapien vitae mi elementum laoreet. Suspendisse potenti. Aliquam erat nisl, aliquam pretium libero aliquet, sagittis eleifend nunc. In hac habitasse platea dictumst. Integer turpis augue, tincidunt dignissim mauris id, rhoncus dapibus purus. Maecenas et enim odio. Nullam massa metus, varius quis vehicula sed, pharetra mollis erat. In quis viverra velit. Vivamus placerat, est nec hendrerit varius, enim dui hendrerit magna, ut pulvinar nibh lorem vel lacus. Mauris a orci iaculis, hendrerit eros sed, gravida leo. In dictum mauris vel augue varius, ac ullamcorper nisl ornare. In eu posuere velit, ac fermentum arcu. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nullam sed malesuada leo, at interdum elit.
Nullam ut tincidunt nunc. [Pellentesque][1] metus lacus, commodo eget justo ut, rutrum varius nunc. Sed non rhoncus risus. Morbi sodales gravida pulvinar. Duis malesuada, odio volutpat elementum vulputate, massa magna scelerisque ante, et accumsan tellus nunc in sem. Donec mattis arcu et velit aliquet, non sagittis justo vestibulum. Suspendisse volutpat felis lectus, nec consequat ipsum mattis id. Donec dapibus vehicula facilisis. In tincidunt mi nisi, nec faucibus tortor euismod nec. Suspendisse ante ligula, aliquet vitae libero eu, vulputate dapibus libero. Sed bibendum, sapien at posuere interdum, libero est sollicitudin magna, ac gravida tellus purus eu ipsum. Proin ut quam arcu.
Suspendisse potenti. Donec ante velit, ornare at augue quis, tristique laoreet sem. Etiam in ipsum elit. Nullam cursus dolor sit amet nulla feugiat tristique. Phasellus ac tellus tincidunt, imperdiet purus eget, ullamcorper ipsum. Cras eu tincidunt sem. Nullam sed dapibus magna. Lorem ipsum dolor sit amet, consectetur adipiscing elit. In id venenatis tortor. In consectetur sollicitudin pharetra. Etiam convallis nisi nunc, et aliquam turpis viverra sit amet. Maecenas faucibus sodales tortor. Suspendisse lobortis mi eu leo viverra volutpat. Pellentesque velit ante, vehicula sodales congue ut, elementum a urna. Cras tempor, ipsum eget luctus rhoncus, arcu ligula fermentum urna, vulputate pharetra enim enim non libero.
Proin diam quam, elementum in eleifend id, elementum et metus. Cras in justo consequat justo semper ultrices. Sed dignissim lectus a ante mollis, nec vulputate ante molestie. Proin in porta nunc. Etiam pulvinar turpis sed velit porttitor, vel adipiscing velit fringilla. Cras ac tellus vitae purus pharetra tincidunt. Sed cursus aliquet aliquet. Cras eleifend commodo malesuada. In turpis turpis, ullamcorper ut tincidunt a, ullamcorper a nunc. Etiam luctus tellus ac dapibus gravida. Ut nec lacus laoreet neque ullamcorper volutpat.
Nunc et leo erat. Aenean mattis ultrices lorem, eget adipiscing dolor ultricies eu. In hac habitasse platea dictumst. Vivamus cursus feugiat sapien quis aliquam. Mauris quam libero, porta vel volutpat ut, blandit a purus. Vivamus vestibulum dui vel tortor molestie, sit amet feugiat sem commodo. Nulla facilisi. Sed molestie arcu eget tellus vestibulum tristique.
[1]: https://github.com/markdown-it

View File

@ -0,0 +1,17 @@
this is a test for tab expansion, be careful not to replace them with spaces
1 4444
22 333
333 22
4444 1
tab-indented line
space-indented line
tab-indented line
a lot of spaces in between here
a lot of tabs in between here

9710
benchmarking/samples/spec.md Normal file

File diff suppressed because it is too large Load Diff

10
codecov.yml Normal file
View File

@ -0,0 +1,10 @@
coverage:
status:
project:
default:
target: 95%
threshold: 0.2%
patch:
default:
target: 80%
threshold: 0.2%

1
docs/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
*.ipynb

28
docs/Makefile Normal file
View File

@ -0,0 +1,28 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# raise warnings to errors
html-strict:
@$(SPHINXBUILD) -b html -nW --keep-going "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O)
# increase logging level to verbose
html-verbose:
@$(SPHINXBUILD) -b html -v "$(SOURCEDIR)" "$(BUILDDIR)/html" $(SPHINXOPTS) $(O)

5
docs/_static/custom.css vendored Normal file
View File

@ -0,0 +1,5 @@
.code-cell > .highlight > pre {
border-left-color: green;
border-left-width: medium;
border-left-style: solid;
}

176
docs/architecture.md Normal file
View File

@ -0,0 +1,176 @@
(md/architecture)=
# markdown-it design principles
(md/data-flow)=
## Data flow
Input data is parsed via nested chains of rules. There are 3 nested chains -
`core`, `block` & `inline`:
```
core
core.rule1 (normalize)
...
core.ruleX
block
block.rule1 (blockquote)
...
block.ruleX
core.ruleX1 (intermediate rule that applies on block tokens, nothing yet)
...
core.ruleXX
inline (applied to each block token with "inline" type)
inline.rule1 (text)
...
inline.ruleX
core.ruleYY (applies to all tokens)
... (abbreviation, footnote, typographer, linkifier)
```
The result of the parsing is a *list of tokens*, that will be passed to the `renderer` to generate the html content.
These tokens can be themselves parsed again to generate more tokens (ex: a `list token` can be divided into multiple `inline tokens`).
An `env` sandbox can be used alongside tokens to inject external variables for your parsers and renderers.
Each chain (core / block / inline) uses an independent `state` object when parsing data, so that each parsing operation is independent and can be disabled on the fly.
## Token stream
Instead of traditional AST we use more low-level data representation - *tokens*.
The difference is simple:
- Tokens are a simple sequence (Array).
- Opening and closing tags are separate.
- There are special token objects, "inline containers", having nested tokens.
sequences with inline markup (bold, italic, text, ...).
See [token class](https://github.com/executablebooks/markdown-it-py/tree/master/markdown_it/token.py)
for details about each token content.
In total, a token stream is:
- On the top level - array of paired or single "block" tokens:
- open/close for headers, lists, blockquotes, paragraphs, ...
- codes, fenced blocks, horizontal rules, html blocks, inlines containers
- Each inline token has a `.children` property with a nested token stream for inline content:
- open/close for strong, em, link, code, ...
- text, line breaks
Why not AST? Because it's not needed for our tasks. We follow KISS principle.
If you wish - you can call a parser without a renderer and convert the token stream
to an AST.
More details about tokens:
- [Renderer source](https://github.com/executablebooks/markdown-it-py/tree/master/markdown_it/renderer.py)
- [Token source](https://github.com/executablebooks/markdown-it-py/tree/master/markdown_it/token.py)
- [Live demo](https://markdown-it.github.io/) - type your text and click `debug` tab.
## Rules
Rules are functions, doing "magic" with parser `state` objects. A rule is associated with one or more *chains* and is unique. For instance, a `blockquote` token is associated with `blockquote`, `paragraph`, `heading` and `list` chains.
Rules are managed by names via [Ruler](https://markdown-it.github.io/markdown-it/#Ruler) instances and can be `enabled` / `disabled` from the [MarkdownIt](https://markdown-it.github.io/markdown-it/#MarkdownIt) methods.
You can note, that some rules have a `validation mode` - in this mode rules do not
modify the token stream, and only look ahead for the end of a token. It's one
important design principle - a token stream is "write only" on block & inline parse stages.
Parsers are designed to keep rules independent of each other. You can safely enable/disable them, or
add new ones. There are no universal recipes for how to create new rules - design of
distributed state machines with good data isolation is a tricky business. But you
can investigate existing rules & plugins to see possible approaches.
Also, in complex cases you can try to ask for help in tracker. Condition is very
simple - it should be clear from your ticket, that you studied docs, sources,
and tried to do something yourself. We never refuse to help real developers.
## Renderer
After the token stream is generated, it's passed to a [renderer](https://github.com/executablebooks/markdown-it-py/tree/master/markdown_it/renderer.py).
It then plays all the tokens, passing each to a rule with the same name as token type.
Renderer rules are located in `md.renderer.rules[name]` and are simple functions
with the same signature:
```python
def function(renderer, tokens, idx, options, env):
return htmlResult
```
In many cases that allows easy output change even without parser intrusion.
For example, let's replace images with vimeo links to player's iframe:
```python
import re
md = MarkdownIt("commonmark")
vimeoRE = re.compile(r'^https?:\/\/(www\.)?vimeo.com\/(\d+)($|\/)')
def render_vimeo(self, tokens, idx, options, env):
token = tokens[idx]
if vimeoRE.match(token.attrs["src"]):
ident = vimeoRE.match(token.attrs["src"])[2]
return ('<div class="embed-responsive embed-responsive-16by9">\n' +
' <iframe class="embed-responsive-item" src="//player.vimeo.com/video/' +
ident + '"></iframe>\n' +
'</div>\n')
return self.image(tokens, idx, options, env)
md = MarkdownIt("commonmark")
md.add_render_rule("image", render_vimeo)
print(md.render("![](https://www.vimeo.com/123)"))
```
Here is another example, how to add `target="_blank"` to all links:
```python
from markdown_it import MarkdownIt
def render_blank_link(self, tokens, idx, options, env):
tokens[idx].attrSet("target", "_blank")
# pass token to default renderer.
return self.renderToken(tokens, idx, options, env)
md = MarkdownIt("commonmark")
md.add_render_rule("link_open", render_blank_link)
print(md.render("[a]\n\n[a]: b"))
```
Note, if you need to add attributes, you can do things without renderer override.
For example, you can update tokens in `core` chain. That is slower, than direct
renderer override, but can be more simple.
You also can write your own renderer to generate other formats than HTML, such as
JSON/XML... You can even use it to generate AST.
## Summary
This was mentioned in [Data flow](md/data-flow), but let's repeat sequence again:
1. Blocks are parsed, and top level of token stream filled with block tokens.
2. Content of inline containers is parsed, filling `.children` properties.
3. Rendering happens.
And somewhere between you can apply additional transformations :) . Full content
of each chain can be seen on the top of
[parser_core.py](https://github.com/executablebooks/markdown-it-py/tree/master/markdown_it/parser_core.py),
[parser_block.py](https://github.com/executablebooks/markdown-it-py/tree/master/markdown_it/parser_block.py) and
[parser_inline.py](https://github.com/executablebooks/markdown-it-py/tree/master/markdown_it/parser_inline.py)
files.
Also you can change output directly in [renderer](https://github.com/executablebooks/markdown-it-py/tree/master/markdown_it/renderer.py) for many simple cases.

150
docs/conf.py Normal file
View File

@ -0,0 +1,150 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
from glob import glob
import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = "markdown-it-py"
copyright = "2020, executable book project"
author = "executable book project"
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.viewcode",
"sphinx.ext.intersphinx",
"myst_parser",
"sphinx_copybutton",
"sphinx_design",
]
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
nitpicky = True
nitpick_ignore = [
("py:class", "Match"),
("py:class", "Path"),
("py:class", "x in the interval [0, 1)."),
("py:class", "markdown_it.helpers.parse_link_destination._Result"),
("py:class", "markdown_it.helpers.parse_link_title._Result"),
("py:class", "MarkdownIt"),
("py:class", "RuleFunc"),
("py:class", "_NodeType"),
("py:class", "typing_extensions.Protocol"),
]
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_title = "markdown-it-py"
html_theme = "sphinx_book_theme"
html_theme_options = {
"use_edit_page_button": True,
"repository_url": "https://github.com/executablebooks/markdown-it-py",
"repository_branch": "master",
"path_to_docs": "docs",
}
html_static_path = ["_static"]
html_css_files = ["custom.css"]
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ["_static"]
intersphinx_mapping = {
"python": ("https://docs.python.org/3.7", None),
"mdit-py-plugins": ("https://mdit-py-plugins.readthedocs.io/en/latest/", None),
}
def run_apidoc(app):
    """Generate API documentation stubs with ``sphinx-apidoc``.

    Connected to the ``builder-inited`` event in :func:`setup`, so the
    ``docs/api`` folder is regenerated at the start of every build
    (needed for Read the Docs, which cannot run apidoc itself).
    See: https://github.com/rtfd/readthedocs.org/issues/1139

    :param app: the Sphinx application object (not used directly).
    """
    # imported here so the module can be imported outside a Sphinx build
    import os
    import shutil

    import sphinx
    from sphinx.ext import apidoc

    logger = sphinx.util.logging.getLogger(__name__)
    logger.info("running apidoc")
    # get correct paths
    this_folder = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
    api_folder = os.path.join(this_folder, "api")
    module_path = os.path.normpath(os.path.join(this_folder, "../"))
    # paths excluded from the generated API docs, relative to the repo root
    ignore_paths = ["../profiler.py", "../conftest.py", "../tests", "../benchmarking"]
    ignore_paths = [
        os.path.normpath(os.path.join(this_folder, p)) for p in ignore_paths
    ]
    # functions from these modules are all imported in the __init__.py with __all__,
    # so the individual rule modules (except the state classes) are skipped
    for rule in ("block", "core", "inline"):
        for path in glob(
            os.path.normpath(
                os.path.join(this_folder, f"../markdown_it/rules_{rule}/*.py")
            )
        ):
            if os.path.basename(path) not in ("__init__.py", f"state_{rule}.py"):
                ignore_paths.append(path)
    # rebuild the api folder from scratch on every run
    if os.path.exists(api_folder):
        shutil.rmtree(api_folder)
    os.mkdir(api_folder)
    argv = ["-M", "--separate", "-o", api_folder, module_path] + ignore_paths
    apidoc.OPTIONS.append("ignore-module-all")
    apidoc.main(argv)
    # we don't use this (top-level modules listing)
    if os.path.exists(os.path.join(api_folder, "modules.rst")):
        os.remove(os.path.join(api_folder, "modules.rst"))
def setup(app):
    """Add functions to the Sphinx setup.

    :param app: the Sphinx application object.
    """
    # allow skipping the (slow) apidoc generation, e.g. for quick local builds
    if os.environ.get("SKIP_APIDOC", None) is None:
        app.connect("builder-inited", run_apidoc)

    from sphinx.directives.code import CodeBlock

    class CodeCell(CodeBlock):
        """Custom code block directive."""

        def run(self):
            """Run the directive."""
            # tag the rendered block so _static/custom.css can style
            # ``.code-cell > .highlight > pre`` like a notebook cell
            self.options["class"] = ["code-cell"]
            return super().run()

    # note, these could be run by myst-nb,
    # but currently this causes a circular dependency issue
    app.add_directive("code-cell", CodeCell)

108
docs/contributing.md Normal file
View File

@ -0,0 +1,108 @@
# Contribute to markdown-it-py
We welcome all contributions! ✨
See the [EBP Contributing Guide](https://executablebooks.org/en/latest/contributing.html) for general details, and below for guidance specific to markdown-it-py.
Before continuing, make sure you've read:
1. [Architecture description](md/architecture)
2. [Security considerations](md/security)
3. [API documentation](api/markdown_it)
## Development guidance
Details of the port can be found in the `markdown_it/port.yaml` and in `port.yaml` files, within the extension folders.
## Code Style
Code style is tested using [flake8](http://flake8.pycqa.org), with the configuration set in `.flake8`, and code formatted with [black](https://github.com/ambv/black).
Installing with `markdown-it-py[code_style]` makes the [pre-commit](https://pre-commit.com/) package available, which will ensure this style is met before commits are submitted, by reformatting the code and testing for lint errors.
It can be set up by:
```shell
>> cd markdown-it-py
>> pre-commit install
```
Editors like VS Code also have automatic code reformat utilities, which can adhere to this standard.
All functions and class methods should be annotated with types and include a docstring.
## Testing
For code tests, markdown-it-py uses [pytest](https://docs.pytest.org):
```shell
>> cd markdown-it-py
>> pytest
```
You can also use [tox](https://tox.readthedocs.io), to run the tests in multiple isolated environments (see the `tox.ini` file for available test environments):
```shell
>> cd markdown-it-py
>> tox -p
```
This can also be used to run benchmarking tests using [pytest-benchmark](https://pytest-benchmark.readthedocs.io):
```shell
>> cd markdown-it-py
tox -e py38-bench-packages -- --benchmark-min-rounds 50
```
For documentation build tests:
```shell
>> cd markdown-it-py/docs
>> make clean
>> make html-strict
```
## Contributing a plugin
1. Does it already exist as JavaScript implementation ([see npm](https://www.npmjs.com/search?q=keywords:markdown-it-plugin))?
Where possible try to port directly from that.
It is usually better to modify existing code, instead of writing all from scratch.
2. Try to find the right place for your plugin rule:
- Will it conflict with existing markup (by priority)?
- If yes - you need to write an inline or block rule.
- If no - you can morph tokens within core chains.
- Remember that token morphing in core chains is always more simple than writing
block or inline rules, if you don't copy existing ones. However,
block and inline rules are usually faster.
- Sometimes, it's enough to only modify the renderer, for example, to add
header IDs or `target="_blank"` for the links.
## FAQ
### I need async rule, how to do it?
Sorry. You can't do it directly. All complex parsers are sync by nature. But you
can use workarounds:
1. On parse phase, replace content by random number and store it in `env`.
2. Do async processing over collected data.
3. Render content and replace those random numbers with text; or replace first, then render.
Alternatively, you can render HTML, then parse it to DOM, or
[cheerio](https://github.com/cheeriojs/cheerio) AST, and apply transformations
in a more convenient way.
### How to replace part of text token with link?
The right sequence is to split text to several tokens and add link tokens in between.
The result will be: `text` + `link_open` + `text` + `link_close` + `text`.
See implementations of [linkify](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_core/linkify.js) and [emoji](https://github.com/markdown-it/markdown-it-emoji/blob/master/lib/replace.js) - those do text token splits.
__Note:__ Don't try to replace text with HTML markup! That's not secure.
### Why is my inline rule not executed?
The inline parser skips pieces of text to optimize speed. It stops only on [a small set of chars](https://github.com/markdown-it/markdown-it/blob/master/lib/rules_inline/text.js), which can be tokens. We did not make this list extensible, also for performance reasons.
If you are absolutely sure that something important is missing there - create a
ticket and we will consider adding it as a new charcode.

41
docs/index.md Normal file
View File

@ -0,0 +1,41 @@
# markdown-it-py
> Markdown parser done right.
- {fa}`check,text-success mr-1` Follows the __[CommonMark spec](http://spec.commonmark.org/)__ for baseline parsing
- {fa}`check,text-success mr-1` Configurable syntax: you can add new rules and even replace existing ones.
- {fa}`check,text-success mr-1` Pluggable: Adds syntax extensions to extend the parser (see the [plugin list](md/plugins))
- {fa}`check,text-success mr-1` High speed (see our [benchmarking tests](md/performance))
- {fa}`check,text-success mr-1` [Safe by default](md/security)
For a good introduction to [markdown-it] see the __[Live demo](https://markdown-it.github.io)__.
This is a Python port of the well used [markdown-it], and some of its associated plugins.
The driving design philosophy of the port has been to change as little of the fundamental code structure (file names, function name, etc) as possible, just sprinkling in a little Python syntactical sugar ✨.
It is very simple to write complementary extensions for both language implementations!
## References & Thanks
Big thanks to the authors of [markdown-it]
- Alex Kocharin [github/rlidwka](https://github.com/rlidwka)
- Vitaly Puzrin [github/puzrin](https://github.com/puzrin)
Also [John MacFarlane](https://github.com/jgm) for his work on the CommonMark spec and reference implementations.
## Related Links
- <https://github.com/jgm/CommonMark> - reference CommonMark implementations in C & JS, also contains latest spec & online demo.
- <http://talk.commonmark.org> - CommonMark forum, good place to collaborate developers' efforts.
```{toctree}
:maxdepth: 2
using
architecture
other
plugins
contributing
api/markdown_it
```
[markdown-it]: https://github.com/markdown-it/markdown-it

66
docs/other.md Normal file
View File

@ -0,0 +1,66 @@
(md/security)=
# Security
Many people don't understand that markdown format does not care much about security.
In many cases you have to pass output to sanitizers.
`markdown-it` provides 2 possible strategies to produce safe output:
1. Don't enable HTML. Extend markup features with [plugins](md/plugins).
We think it's the best choice and use it by default.
- That's ok for 99% of user needs.
- Output will be safe without sanitizer.
2. Enable HTML and use external sanitizer package(s).
Also by default `markdown-it` prohibits some kind of links, which could be used
for XSS:
- `javascript:`, `vbscript:`
- `file:`
- `data:`, except some images (gif/png/jpeg/webp).
So, by default `markdown-it` should be safe. We care about it.
If you find a security problem - contact us via tracker or email.
Such reports are fixed with top priority.
## Plugins
Usually, plugins operate with tokenized content, and that's enough to provide safe output.
But there is one non-evident case you should know - don't allow plugins to generate arbitrary element `id` and `name`.
If those depend on user input - always add prefixes to avoid DOM clobbering.
See [discussion](https://github.com/markdown-it/markdown-it/issues/28) for details.
So, if you decide to use plugins that add extended class syntax or autogenerating header anchors - be careful.
(md/performance)=
# Performance
You can view our continuous integration benchmarking analysis at: <https://executablebooks.github.io/markdown-it-py/dev/bench/>,
or you can run it for yourself within the repository:
```console
$ tox -e py38-bench-packages -- --benchmark-columns mean,stddev
Name (time in ms) Mean StdDev
---------------------------------------------------------------
test_mistune 70.3272 (1.0) 0.7978 (1.0)
test_mistletoe 116.0919 (1.65) 6.2870 (7.88)
test_markdown_it_py 152.9022 (2.17) 4.2988 (5.39)
test_commonmark_py 326.9506 (4.65) 15.8084 (19.81)
test_pymarkdown 368.2712 (5.24) 7.5906 (9.51)
test_pymarkdown_extra 640.4913 (9.11) 15.1769 (19.02)
test_panflute 678.3547 (9.65) 9.4622 (11.86)
---------------------------------------------------------------
```
As you can see, `markdown-it-py` doesn't pay with speed for its flexibility.
```{note}
`mistune` is not CommonMark compliant, which is what allows for its
faster parsing, at the expense of issues, for example, with nested inline parsing.
See [mistletoes's explanation](https://github.com/miyuchina/mistletoe/blob/master/performance.md)
for further details.
```

50
docs/plugins.md Normal file
View File

@ -0,0 +1,50 @@
(md/plugins)=
# Plugin extensions
The following plugins are embedded within the core package:
- [tables](https://help.github.com/articles/organizing-information-with-tables/) (GFM)
- [strikethrough](https://help.github.com/articles/basic-writing-and-formatting-syntax/#styling-text) (GFM)
These can be enabled individually:
```python
from markdown_it import MarkdownIt
md = MarkdownIt("commonmark").enable('table')
```
or as part of a configuration:
```python
from markdown_it import MarkdownIt
md = MarkdownIt("gfm-like")
```
```{seealso}
See [](using.md)
```
Many other plugins are then available *via* the `mdit-py-plugins` package, including:
- Front-matter
- Footnotes
- Definition lists
- Task lists
- Heading anchors
- LaTeX math
- Containers
- Word count
For full information see: <https://mdit-py-plugins.readthedocs.io>
Or you can write them yourself!
They can be chained and loaded *via*:
```python
from markdown_it import MarkdownIt
from mdit_py_plugins import plugin1, plugin2
md = MarkdownIt().use(plugin1, keyword=value).use(plugin2, keyword=value)
html_string = md.render("some *Markdown*")
```

399
docs/using.md Normal file
View File

@ -0,0 +1,399 @@
---
jupytext:
formats: ipynb,md:myst
text_representation:
extension: .md
format_name: myst
format_version: '0.8'
jupytext_version: 1.4.2
kernelspec:
display_name: Python 3
language: python
name: python3
---
# Using `markdown_it`
> This document can be opened to execute with [Jupytext](https://jupytext.readthedocs.io)!
markdown-it-py may be used as an API *via* the [`markdown-it-py`](https://pypi.org/project/markdown-it-py/) package.
The raw text is first parsed to syntax 'tokens',
then these are converted to other formats using 'renderers'.
+++
## Quick-Start
The simplest way to understand how text will be parsed is using:
```{code-cell} python
from pprint import pprint
from markdown_it import MarkdownIt
```
```{code-cell} python
md = MarkdownIt()
md.render("some *text*")
```
```{code-cell} python
for token in md.parse("some *text*"):
print(token)
print()
```
## The Parser
+++
The `MarkdownIt` class is instantiated with parsing configuration options,
dictating the syntax rules and additional options for the parser and renderer.
You can define this configuration *via* directly supplying a dictionary or a preset name:
- `zero`: This configures the minimum components to parse text (i.e. just paragraphs and text)
- `commonmark` (default): This configures the parser to strictly comply with the [CommonMark specification](http://spec.commonmark.org/).
- `js-default`: This is the default in the JavaScript version.
Compared to `commonmark`, it disables HTML parsing and enables the table and strikethrough components.
- `gfm-like`: This configures the parser to approximately comply with the [GitHub Flavored Markdown specification](https://github.github.com/gfm/).
Compared to `commonmark`, it enables the table, strikethrough and linkify components.
**Important**, to use this configuration you must have `linkify-it-py` installed.
```{code-cell} python
from markdown_it.presets import zero
zero.make()
```
```{code-cell} python
md = MarkdownIt("zero")
md.options
```
You can also override specific options:
```{code-cell} python
md = MarkdownIt("zero", {"maxNesting": 99})
md.options
```
```{code-cell} python
pprint(md.get_active_rules())
```
You can find all the parsing rules in the source code:
`parser_core.py`, `parser_block.py`,
`parser_inline.py`.
```{code-cell} python
pprint(md.get_all_rules())
```
Any of the parsing rules can be enabled/disabled, and these methods are "chainable":
```{code-cell} python
md.render("- __*emphasise this*__")
```
```{code-cell} python
md.enable(["list", "emphasis"]).render("- __*emphasise this*__")
```
You can temporarily modify rules with the `reset_rules` context manager.
```{code-cell} python
with md.reset_rules():
md.disable("emphasis")
print(md.render("__*emphasise this*__"))
md.render("__*emphasise this*__")
```
Additionally `renderInline` runs the parser with all block syntax rules disabled.
```{code-cell} python
md.renderInline("__*emphasise this*__")
```
### Typographic components
The `smartquotes` and `replacements` components are intended to improve typography:
`smartquotes` will convert basic quote marks to their opening and closing variants:
- 'single quotes' -> single quotes
- "double quotes" -> “double quotes”
`replacements` will replace particular text constructs:
- ``(c)``, ``(C)`` → ©
- ``(tm)``, ``(TM)`` → ™
- ``(r)``, ``(R)`` → ®
- ``(p)``, ``(P)`` → §
- ``+-`` → ±
- ``...`` → …
- ``?....`` → ?..
- ``!....`` → !..
- ``????????`` → ???
- ``!!!!!`` → !!!
- ``,,,`` → ,
- ``--`` → –
- ``---`` → —
Both of these components require typography to be turned on, as well as the components enabled:
```{code-cell} python
md = MarkdownIt("commonmark", {"typographer": True})
md.enable(["replacements", "smartquotes"])
md.render("'single quotes' (c)")
```
### Linkify
The `linkify` component requires that [linkify-it-py](https://github.com/tsutsu3/linkify-it-py) be installed (e.g. *via* `pip install markdown-it-py[linkify]`).
This allows URI autolinks to be identified, without the need for enclosing in `<>` brackets:
```{code-cell} python
md = MarkdownIt("commonmark", {"linkify": True})
md.enable(["linkify"])
md.render("github.com")
```
### Plugins load
Plugins load collections of additional syntax rules and render methods into the parser.
A number of useful plugins are available in [`mdit_py_plugins`](https://github.com/executablebooks/mdit-py-plugins) (see [the plugin list](./plugins.md)),
or you can create your own (following the [markdown-it design principles](./architecture.md)).
```{code-cell} python
from markdown_it import MarkdownIt
import mdit_py_plugins
from mdit_py_plugins.front_matter import front_matter_plugin
from mdit_py_plugins.footnote import footnote_plugin
md = (
MarkdownIt()
.use(front_matter_plugin)
.use(footnote_plugin)
.enable('table')
)
text = ("""
---
a: 1
---
a | b
- | -
1 | 2
A footnote [^1]
[^1]: some details
""")
md.render(text)
```
## The Token Stream
+++
Before rendering, the text is parsed to a flat token stream of block level syntax elements, with nesting defined by opening (1) and closing (-1) attributes:
```{code-cell} python
md = MarkdownIt("commonmark")
tokens = md.parse("""
Here's some *text*
1. a list
> a *quote*""")
[(t.type, t.nesting) for t in tokens]
```
Naturally all openings should eventually be closed,
such that:
```{code-cell} python
sum([t.nesting for t in tokens]) == 0
```
All tokens are the same class, which can also be created outside the parser:
```{code-cell} python
tokens[0]
```
```{code-cell} python
from markdown_it.token import Token
token = Token("paragraph_open", "p", 1, block=True, map=[1, 2])
token == tokens[0]
```
The `'inline'` type token contain the inline tokens as children:
```{code-cell} python
tokens[1]
```
You can serialize a token (and its children) to a JSONable dictionary using:
```{code-cell} python
print(tokens[1].as_dict())
```
This dictionary can also be deserialized:
```{code-cell} python
Token.from_dict(tokens[1].as_dict())
```
### Creating a syntax tree
```{versionchanged} 0.7.0
`nest_tokens` and `NestedTokens` are deprecated and replaced by `SyntaxTreeNode`.
```
In some use cases it may be useful to convert the token stream into a syntax tree,
with opening/closing tokens collapsed into a single token that contains children.
```{code-cell} python
from markdown_it.tree import SyntaxTreeNode
md = MarkdownIt("commonmark")
tokens = md.parse("""
# Header
Here's some text and an image ![title](image.png)
1. a **list**
> a *quote*
""")
node = SyntaxTreeNode(tokens)
print(node.pretty(indent=2, show_text=True))
```
You can then use methods to traverse the tree
```{code-cell} python
node.children
```
```{code-cell} python
print(node[0])
node[0].next_sibling
```
## Renderers
+++
After the token stream is generated, it's passed to a [renderer](https://github.com/executablebooks/markdown-it-py/tree/master/markdown_it/renderer.py).
It then plays all the tokens, passing each to a rule with the same name as token type.
Renderer rules are located in `md.renderer.rules` and are simple functions
with the same signature:
```python
def function(renderer, tokens, idx, options, env):
return htmlResult
```
+++
You can inject render methods into the instantiated render class.
```{code-cell} python
md = MarkdownIt("commonmark")
def render_em_open(self, tokens, idx, options, env):
return '<em class="myclass">'
md.add_render_rule("em_open", render_em_open)
md.render("*a*")
```
This is a slight change to the JS version, where the renderer argument is at the end.
Also `add_render_rule` method is specific to Python, rather than adding directly to the `md.renderer.rules`, this ensures the method is bound to the renderer.
+++
You can also subclass a render and add the method there:
```{code-cell} python
from markdown_it.renderer import RendererHTML
class MyRenderer(RendererHTML):
def em_open(self, tokens, idx, options, env):
return '<em class="myclass">'
md = MarkdownIt("commonmark", renderer_cls=MyRenderer)
md.render("*a*")
```
Plugins can support multiple render types, using the `__output__` attribute (this is currently a Python only feature).
```{code-cell} python
from markdown_it.renderer import RendererHTML
class MyRenderer1(RendererHTML):
__output__ = "html1"
class MyRenderer2(RendererHTML):
__output__ = "html2"
def plugin(md):
def render_em_open1(self, tokens, idx, options, env):
return '<em class="myclass1">'
def render_em_open2(self, tokens, idx, options, env):
return '<em class="myclass2">'
md.add_render_rule("em_open", render_em_open1, fmt="html1")
md.add_render_rule("em_open", render_em_open2, fmt="html2")
md = MarkdownIt("commonmark", renderer_cls=MyRenderer1).use(plugin)
print(md.render("*a*"))
md = MarkdownIt("commonmark", renderer_cls=MyRenderer2).use(plugin)
print(md.render("*a*"))
```
Here's a more concrete example; let's replace images with vimeo links to player's iframe:
```{code-cell} python
import re
from markdown_it import MarkdownIt
vimeoRE = re.compile(r'^https?:\/\/(www\.)?vimeo.com\/(\d+)($|\/)')
def render_vimeo(self, tokens, idx, options, env):
token = tokens[idx]
if vimeoRE.match(token.attrs["src"]):
ident = vimeoRE.match(token.attrs["src"])[2]
return ('<div class="embed-responsive embed-responsive-16by9">\n' +
' <iframe class="embed-responsive-item" src="//player.vimeo.com/video/' +
ident + '"></iframe>\n' +
'</div>\n')
return self.image(tokens, idx, options, env)
md = MarkdownIt("commonmark")
md.add_render_rule("image", render_vimeo)
print(md.render("![](https://www.vimeo.com/123)"))
```
Here is another example, how to add `target="_blank"` to all links:
```{code-cell} python
from markdown_it import MarkdownIt
def render_blank_link(self, tokens, idx, options, env):
tokens[idx].attrSet("target", "_blank")
# pass token to default renderer.
return self.renderToken(tokens, idx, options, env)
md = MarkdownIt("commonmark")
md.add_render_rule("link_open", render_blank_link)
print(md.render("[a]\n\n[a]: b"))
```

5
markdown_it/__init__.py Normal file
View File

@ -0,0 +1,5 @@
"""A Python port of Markdown-It"""
__all__ = ("MarkdownIt",)
__version__ = "2.1.0"
from .main import MarkdownIt

10
markdown_it/_compat.py Normal file
View File

@ -0,0 +1,10 @@
from __future__ import annotations
from collections.abc import Mapping
import sys
from typing import Any
# Keyword arguments passed to ``dataclasses.dataclass`` throughout the package.
# ``slots=True`` was only added to the dataclass decorator in Python 3.10,
# so on older interpreters no extra arguments are used.
if sys.version_info >= (3, 10):
    DATACLASS_KWARGS: Mapping[str, Any] = {"slots": True}
else:
    DATACLASS_KWARGS: Mapping[str, Any] = {}

66
markdown_it/_punycode.py Normal file
View File

@ -0,0 +1,66 @@
# Copyright 2014 Mathias Bynens <https://mathiasbynens.be/>
# Copyright 2021 Taneli Hukkinen
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import codecs
import re
REGEX_SEPARATORS = re.compile(r"[\x2E\u3002\uFF0E\uFF61]")
REGEX_NON_ASCII = re.compile(r"[^\0-\x7E]")


def encode(uni: str) -> str:
    """Punycode-encode a unicode label, returning its ASCII form."""
    return codecs.encode(uni, encoding="punycode").decode()


def decode(ascii: str) -> str:
    """Decode a punycode ASCII label back to its unicode form."""
    return codecs.decode(ascii, encoding="punycode")  # type: ignore[call-overload]


def map_domain(string, fn):
    """Apply ``fn`` to each dot-separated label of the domain part of ``string``.

    For email-like input (containing ``@``), only the text after the first
    ``@`` is treated as the domain; the local part is passed through unchanged.
    """
    prefix = ""
    parts = string.split("@")
    if len(parts) > 1:
        # In email addresses, only the domain name should be punycoded.
        # Leave the local part (everything up to `@`) intact.
        prefix = parts[0] + "@"
        string = parts[1]
    converted = [fn(label) for label in REGEX_SEPARATORS.split(string)]
    return prefix + ".".join(converted)


def to_unicode(obj: str) -> str:
    """Convert any ``xn--`` (punycode) labels in a domain to unicode."""

    def mapping(label: str) -> str:
        return decode(label[4:].lower()) if label.startswith("xn--") else label

    return map_domain(obj, mapping)


def to_ascii(obj: str) -> str:
    """Punycode-encode any labels in a domain that contain non-ASCII characters."""

    def mapping(label: str) -> str:
        return "xn--" + encode(label) if REGEX_NON_ASCII.search(label) else label

    return map_domain(obj, mapping)

View File

109
markdown_it/cli/parse.py Normal file
View File

@ -0,0 +1,109 @@
#!/usr/bin/env python
"""
CLI interface to markdown-it-py
Parse one or more markdown files, convert each to HTML, and print to stdout.
"""
from __future__ import annotations
import argparse
from collections.abc import Iterable, Sequence
import sys
from markdown_it import __version__
from markdown_it.main import MarkdownIt
version_str = "markdown-it-py [version {}]".format(__version__)
def main(args: Sequence[str] | None = None) -> int:
    """CLI entry point: convert the given files, or run the interactive REPL."""
    namespace = parse_args(args)
    if not namespace.filenames:
        interactive()
    else:
        convert(namespace.filenames)
    return 0
def convert(filenames: Iterable[str]) -> None:
    """Render each named Markdown file to HTML on stdout."""
    for name in filenames:
        convert_file(name)
def convert_file(filename: str) -> None:
    """
    Parse a Markdown file and dump the output to stdout.

    Exits the process with status 1 if the file cannot be read.
    """
    try:
        with open(filename, "r") as fin:
            rendered = MarkdownIt().render(fin.read())
        print(rendered, end="")
    except OSError:
        # Include the offending path in the message: the previous f-string
        # had no placeholder and always printed the same fixed text.
        sys.stderr.write(f'Cannot open file "{filename}".\n')
        sys.exit(1)
def interactive() -> None:
    """
    Parse user input, dump to stdout, rinse and repeat.
    Python REPL style.

    Lines are accumulated until EOF (Ctrl-D), at which point the whole
    buffer is rendered as one Markdown document; Ctrl-C exits the loop.
    """
    print_heading()
    contents = []
    more = False
    while True:
        try:
            # First line of a document gets ">>> "; every following line
            # uses the "... " continuation prompt (``more`` is set True in
            # both branches, and only reset on EOF).
            prompt, more = ("... ", True) if more else (">>> ", True)
            contents.append(input(prompt) + "\n")
        except EOFError:
            # Ctrl-D: render everything typed so far, then start fresh.
            print("\n" + MarkdownIt().render("\n".join(contents)), end="")
            more = False
            contents = []
        except KeyboardInterrupt:
            # Ctrl-C: leave the REPL.
            print("\nExiting.")
            break
def parse_args(args: Sequence[str] | None) -> argparse.Namespace:
    """Parse input CLI arguments.

    :param args: argument list to parse, or None to use ``sys.argv[1:]``
    :return: namespace with a ``filenames`` list (possibly empty)
    """
    parser = argparse.ArgumentParser(
        description="Parse one or more markdown files, "
        "convert each to HTML, and print to stdout",
        # NOTE: Remember to update README.md w/ the output of `markdown-it -h`
        epilog=(
            f"""
Interactive:

  $ markdown-it
  markdown-it-py [version {__version__}] (interactive)
  Type Ctrl-D to complete input, or Ctrl-C to exit.
  >>> # Example
  ... > markdown *input*
  ...
  <h1>Example</h1>
  <blockquote>
  <p>markdown <em>input</em></p>
  </blockquote>

Batch:
  $ markdown-it README.md README.footer.md > index.html
"""
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("-v", "--version", action="version", version=version_str)
    parser.add_argument(
        "filenames", nargs="*", help="specify an optional list of files to convert"
    )
    return parser.parse_args(args)
def print_heading() -> None:
    """Print the interactive-mode banner and usage hint."""
    print(f"{version_str} (interactive)")
    print("Type Ctrl-D to complete input, or Ctrl-C to exit.")
if __name__ == "__main__":
    # Support direct execution (``python -m markdown_it.cli.parse`` / script entry).
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)

View File

View File

@ -0,0 +1,4 @@
"""HTML5 entities map: { name -> characters }."""
import html.entities
entities = {name.rstrip(";"): chars for name, chars in html.entities.html5.items()}

View File

@ -0,0 +1,68 @@
"""List of valid html blocks names, according to commonmark spec
http://jgm.github.io/CommonMark/spec.html#html-blocks
"""
block_names = [
"address",
"article",
"aside",
"base",
"basefont",
"blockquote",
"body",
"caption",
"center",
"col",
"colgroup",
"dd",
"details",
"dialog",
"dir",
"div",
"dl",
"dt",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"hr",
"html",
"iframe",
"legend",
"li",
"link",
"main",
"menu",
"menuitem",
"nav",
"noframes",
"ol",
"optgroup",
"option",
"p",
"param",
"section",
"source",
"summary",
"table",
"tbody",
"td",
"tfoot",
"th",
"thead",
"title",
"tr",
"track",
"ul",
]

View File

@ -0,0 +1,40 @@
"""Regexps to match html elements
"""
import re
attr_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"
unquoted = "[^\"'=<>`\\x00-\\x20]+"
single_quoted = "'[^']*'"
double_quoted = '"[^"]*"'
attr_value = "(?:" + unquoted + "|" + single_quoted + "|" + double_quoted + ")"
attribute = "(?:\\s+" + attr_name + "(?:\\s*=\\s*" + attr_value + ")?)"
open_tag = "<[A-Za-z][A-Za-z0-9\\-]*" + attribute + "*\\s*\\/?>"
close_tag = "<\\/[A-Za-z][A-Za-z0-9\\-]*\\s*>"
comment = "<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->"
processing = "<[?][\\s\\S]*?[?]>"
declaration = "<![A-Z]+\\s+[^>]*>"
cdata = "<!\\[CDATA\\[[\\s\\S]*?\\]\\]>"
HTML_TAG_RE = re.compile(
"^(?:"
+ open_tag
+ "|"
+ close_tag
+ "|"
+ comment
+ "|"
+ processing
+ "|"
+ declaration
+ "|"
+ cdata
+ ")"
)
HTML_OPEN_CLOSE_TAG_STR = "^(?:" + open_tag + "|" + close_tag + ")"
HTML_OPEN_CLOSE_TAG_RE = re.compile(HTML_OPEN_CLOSE_TAG_STR)

View File

@ -0,0 +1,82 @@
from __future__ import annotations
from collections.abc import Callable
import re
from urllib.parse import quote, unquote, urlparse, urlunparse # noqa: F401
import mdurl
from .. import _punycode
RECODE_HOSTNAME_FOR = ("http:", "https:", "mailto:")
def normalizeLink(url: str) -> str:
    """Normalize destination URLs in links

    ::

        [label]: destination 'title'
                 ^^^^^^^^^^^

    The hostname is punycode-encoded for known schemes, then the whole URL
    is percent-encoded via ``mdurl.encode``.
    """
    parsed = mdurl.parse(url, slashes_denote_host=True)
    if parsed.hostname:
        # Encode hostnames in urls like:
        # `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
        #
        # We don't encode unknown schemas, because it's likely that we encode
        # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
        #
        if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR:
            try:
                parsed = parsed._replace(hostname=_punycode.to_ascii(parsed.hostname))
            except Exception:
                # punycode conversion can fail on malformed labels; keep the
                # original hostname in that case (best-effort normalization).
                pass
    return mdurl.encode(mdurl.format(parsed))
def normalizeLinkText(url: str) -> str:
    """Normalize autolink content

    ::

        <destination>
        ~~~~~~~~~~~

    Inverse of :func:`normalizeLink` for display text: hostnames are
    converted back to unicode and the URL is percent-decoded.
    """
    parsed = mdurl.parse(url, slashes_denote_host=True)
    if parsed.hostname:
        # Encode hostnames in urls like:
        # `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
        #
        # We don't encode unknown schemas, because it's likely that we encode
        # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
        #
        if not parsed.protocol or parsed.protocol in RECODE_HOSTNAME_FOR:
            try:
                parsed = parsed._replace(hostname=_punycode.to_unicode(parsed.hostname))
            except Exception:
                # malformed punycode labels are left untouched (best-effort).
                pass
    # add '%' to exclude list because of https://github.com/markdown-it/markdown-it/issues/720
    return mdurl.decode(mdurl.format(parsed), mdurl.DECODE_DEFAULT_CHARS + "%")
BAD_PROTO_RE = re.compile(r"^(vbscript|javascript|file|data):")
GOOD_DATA_RE = re.compile(r"^data:image\/(gif|png|jpeg|webp);")


def validateLink(url: str, validator: Callable | None = None) -> bool:
    """Validate URL link is allowed in output.

    This validator can prohibit more than really needed to prevent XSS.
    It's a tradeoff to keep code simple and to be secure by default.

    Note: url should be normalized at this point, and existing entities decoded.

    :param validator: optional override; when given, its result is returned as-is
    """
    if validator is not None:
        return validator(url)
    normalized = url.strip().lower()
    if BAD_PROTO_RE.search(normalized):
        # scripting/file/data schemes are rejected, except safe image data-URIs
        return bool(GOOD_DATA_RE.search(normalized))
    return True

334
markdown_it/common/utils.py Normal file
View File

@ -0,0 +1,334 @@
"""Utilities for parsing source text
"""
import html
import re
from typing import Any
from .entities import entities
def charCodeAt(src: str, pos: int) -> Any:
    """
    Returns the Unicode value of the character at the specified location.

    @param - index The zero-based index of the desired character.
    If there is no character at the specified index, None is returned
    (the JS original returns NaN).

    This was added for compatibility with python
    """
    try:
        char = src[pos]
    except IndexError:
        return None
    return ord(char)
# Merge objects
#
def assign(obj):
    """Merge objects /*from1, from2, from3, ...*/)

    Port of the JS ``assign`` helper; never needed by the Python
    implementation, so it is intentionally left unimplemented.
    """
    raise NotImplementedError
def arrayReplaceAt(src: list, pos: int, newElements: list) -> list:
    """
    Remove element from array and put another array at those position.
    Useful for some operations with tokens.

    Returns a new list; ``src`` is not mutated.
    """
    head = src[:pos]
    tail = src[pos + 1 :]
    return head + newElements + tail
######################################################################
def isValidEntityCode(c: int) -> bool:
    """True if ``c`` is a code point that may be produced by a character reference."""
    # surrogate halves — broken sequence
    if 0xD800 <= c <= 0xDFFF:
        return False
    # unicode noncharacters — never used
    if 0xFDD0 <= c <= 0xFDEF:
        return False
    if (c & 0xFFFF) in (0xFFFE, 0xFFFF):
        return False
    # control codes
    if 0x00 <= c <= 0x08 or c == 0x0B or 0x0E <= c <= 0x1F:
        return False
    if 0x7F <= c <= 0x9F:
        return False
    # out of unicode range
    return c <= 0x10FFFF
def fromCodePoint(c: int) -> str:
    """Convert ordinal to unicode (port of JS ``String.fromCodePoint``).

    Note, in the original Javascript two string characters were required
    for codepoints larger than `0xFFFF`; Python 3 represents any unicode
    codepoint as a single character.
    """
    return chr(c)
UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
UNESCAPE_ALL_RE = re.compile(
r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
re.IGNORECASE,
)
DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve an HTML entity reference to its character(s).

    :param match: the full matched text, returned unchanged when the
        reference is invalid
    :param name: the entity body, e.g. ``copy``, ``#8212`` or ``#x2014``
    """
    code = 0
    # named entity, e.g. "copy" -> "©"
    if name in entities:
        return entities[name]
    # numeric character reference: "#..." decimal or "#x..." hexadecimal
    if ord(name[0]) == 0x23 and DIGITAL_ENTITY_TEST_RE.search(name):
        code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10)
        if isValidEntityCode(code):
            return fromCodePoint(code)
    # invalid reference: leave the original text untouched
    return match
# def replaceEntities(string):
# if (string.indexOf('&') < 0):
# return string
# return string.replace(ENTITY_RE, replaceEntityPattern)
def unescapeMd(string: str) -> str:
    """Remove markdown backslash escapes — not ported from JS (unused here)."""
    raise NotImplementedError
def unescapeAll(string: str) -> str:
    """Replace backslash escapes and HTML entity references with their characters."""
    def replacer_func(match):
        # group 1 matches a backslash-escaped punctuation char -> bare char
        escaped = match.group(1)
        if escaped:
            return escaped
        # group 2 matches an entity body -> resolved character(s)
        entity = match.group(2)
        return replaceEntityPattern(match.group(), entity)
    # fast path: nothing that could need unescaping
    if "\\" not in string and "&" not in string:
        return string
    return UNESCAPE_ALL_RE.sub(replacer_func, string)
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")


def stripEscape(string: str) -> str:
    """Remove a backslash escape preceding any escapable punctuation character."""
    return ESCAPE_CHAR.sub(r"\1", string)
# //////////////////////////////////////////////////////////////////////////////
# TODO This section changed quite a lot, should re-check
# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')
# def escapeHtml(string: str):
# if HTML_ESCAPE_REPLACE_RE.search(string):
# string = UNESCAPE_HTML_RE.sub("&", string)
# string = ESCAPE_AND_HTML.sub("&amp;", string)
# for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
# string = string.replace(k, v)
# return string
def escapeHtml(raw: str) -> str:
    """Escape ``&``, ``<``, ``>`` and double quotes for HTML output.

    Single quotes are deliberately left unescaped (``html.escape`` converts
    them to ``&#x27;``, which is reverted here to match the JS renderer).
    """
    escaped = html.escape(raw)
    return escaped.replace("&#x27;", "'")
# //////////////////////////////////////////////////////////////////////////////
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape regex metacharacters in ``string``.

    The original port used the JS replacement string ``"\\$&"``; Python's
    ``re.sub`` has no ``$&`` token, so every special character was replaced
    with the literal text ``\\$&``.  ``\\g<0>`` inserts the matched character.
    """
    string = REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)
    return string
# //////////////////////////////////////////////////////////////////////////////
def isSpace(code: object) -> bool:
    """True for an ASCII tab (0x09) or space (0x20) character code."""
    tab_or_space = {0x09, 0x20}
    return code in tab_or_space
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,  # space
    0xA0,  # no-break space
    0x1680,  # ogham space mark
    0x202F,  # narrow no-break space
    0x205F,  # medium mathematical space
    0x3000,  # ideographic space
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]"""
    # U+2000..U+200A is the contiguous en/em/thin-space family (all Zs).
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE
# //////////////////////////////////////////////////////////////////////////////
UNICODE_PUNCT_RE = re.compile(
r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]" # noqa: E501
)
# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """True if ``ch`` matches the unicode punctuation pattern above
    (currently without astral character support)."""
    return UNICODE_PUNCT_RE.search(ch) is not None
MD_ASCII_PUNCT = {
0x21, # /* ! */
0x22, # /* " */
0x23, # /* # */
0x24, # /* $ */
0x25, # /* % */
0x26, # /* & */
0x27, # /* ' */
0x28, # /* ( */
0x29, # /* ) */
0x2A, # /* * */
0x2B, # /* + */
0x2C, # /* , */
0x2D, # /* - */
0x2E, # /* . */
0x2F, # /* / */
0x3A, # /* : */
0x3B, # /* ; */
0x3C, # /* < */
0x3D, # /* = */
0x3E, # /* > */
0x3F, # /* ? */
0x40, # /* @ */
0x5B, # /* [ */
0x5C, # /* \ */
0x5D, # /* ] */
0x5E, # /* ^ */
0x5F, # /* _ */
0x60, # /* ` */
0x7B, # /* { */
0x7C, # /* | */
0x7D, # /* } */
0x7E, # /* ~ */
}
def isMdAsciiPunct(ch: int) -> bool:
    """Markdown ASCII punctuation characters.

    :param ch: the character *code point*, not the character itself

    ::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \\, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.
    """  # noqa: E501
    return ch in MD_ASCII_PUNCT
def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    Whitespace is trimmed and collapsed, then the text is case-folded by
    applying ``lower()`` followed by ``upper()``.
    """
    # Trim and collapse internal whitespace to single spaces.
    collapsed = re.sub(r"\s+", " ", string.strip())
    # lower().upper() normalizes letter variants that a plain upper() or
    # lower() alone would miss (e.g. ϴ/θ/ϑ all become Θ, ẞ/ß become SS) —
    # equivalent to unicode case folding.  The result is uppercased because
    # the JS original stored it as an object key and had to avoid collisions
    # with Object.prototype members such as `__proto__`.
    return collapsed.lower().upper()

View File

@ -0,0 +1,6 @@
"""Functions for parsing Links
"""
__all__ = ("parseLinkLabel", "parseLinkDestination", "parseLinkTitle")
from .parse_link_destination import parseLinkDestination
from .parse_link_label import parseLinkLabel
from .parse_link_title import parseLinkTitle

View File

@ -0,0 +1,86 @@
"""
Parse link destination
"""
from ..common.utils import charCodeAt, unescapeAll
class _Result:
__slots__ = ("ok", "pos", "lines", "str")
def __init__(self):
self.ok = False
self.pos = 0
self.lines = 0
self.str = ""
def parseLinkDestination(string: str, pos: int, maximum: int) -> _Result:
    """Parse a link destination starting at ``pos``, scanning at most to ``maximum``.

    Handles both the ``<...>`` pointy-bracket form and the bare form with
    balanced parentheses.  On failure the returned result has ``ok == False``.
    """
    lines = 0
    start = pos
    result = _Result()
    # --- pointy-bracket destination: <...> ---
    if charCodeAt(string, pos) == 0x3C:  # /* < */
        pos += 1
        while pos < maximum:
            code = charCodeAt(string, pos)
            if code == 0x0A:  # newline is not allowed inside <...>
                return result
            if code == 0x3C:  # a second unescaped '<' is invalid
                return result
            if code == 0x3E:  # '>' closes the destination
                result.pos = pos + 1
                result.str = unescapeAll(string[start + 1 : pos])
                result.ok = True
                return result
            if code == 0x5C and pos + 1 < maximum:  # backslash escape: skip next char
                pos += 2
                continue
            pos += 1
        # no closing '>'
        return result
    # --- bare destination (the JS "else" branch) ---
    # ``level`` tracks unescaped parenthesis nesting.
    level = 0
    while pos < maximum:
        code = charCodeAt(string, pos)
        if code == 0x20:  # space terminates a bare destination
            break
        # ascii control characters terminate as well
        if code < 0x20 or code == 0x7F:
            break
        if code == 0x5C and pos + 1 < maximum:
            # backslash escape; an escaped space still terminates
            if charCodeAt(string, pos + 1) == 0x20:
                break
            pos += 2
            continue
        if code == 0x28:  # '(' increases nesting, capped at 32 levels
            level += 1
            if level > 32:
                return result
        if code == 0x29:  # ')' closes nesting; an unmatched ')' terminates
            if level == 0:
                break
            level -= 1
        pos += 1
    if start == pos:
        # empty destination
        return result
    if level != 0:
        # unbalanced parentheses
        return result
    result.str = unescapeAll(string[start:pos])
    # NOTE(review): ``lines`` is never incremented in this function, so it is
    # always 0 here — mirrors the upstream behaviour for destinations.
    result.lines = lines
    result.pos = pos
    result.ok = True
    return result

View File

@ -0,0 +1,44 @@
"""
Parse link label
this function assumes that first character ("[") already matches
returns the end of the label
"""
from markdown_it.rules_inline import StateInline
def parseLinkLabel(state: StateInline, start: int, disableNested: bool = False) -> int:
    """Scan for the ``]`` closing the link label opened at ``start``.

    Assumes ``state.src[start]`` is ``[``.  Returns the index of the closing
    bracket, or -1 when no (valid) label end is found.  ``state.pos`` is
    restored before returning.
    """
    labelEnd = -1
    oldPos = state.pos
    found = False
    state.pos = start + 1
    level = 1  # bracket nesting depth; starts inside the opening '['
    while state.pos < state.posMax:
        marker = state.srcCharCode[state.pos]
        if marker == 0x5D:  # /* ] */
            level -= 1
            if level == 0:
                found = True
                break
        prevPos = state.pos
        # let the inline parser skip over whole tokens (code spans, autolinks, ...)
        state.md.inline.skipToken(state)
        if marker == 0x5B:  # /* [ */
            if prevPos == state.pos - 1:
                # increase level if we find text `[`,
                # which is not a part of any token
                level += 1
            elif disableNested:
                # nested link labels are forbidden by the caller
                state.pos = oldPos
                return -1
    if found:
        labelEnd = state.pos
    # restore old state
    state.pos = oldPos
    return labelEnd

View File

@ -0,0 +1,60 @@
"""Parse link title
"""
from ..common.utils import charCodeAt, unescapeAll
class _Result:
__slots__ = ("ok", "pos", "lines", "str")
def __init__(self):
self.ok = False
self.pos = 0
self.lines = 0
self.str = ""
def __str__(self):
return self.str
def parseLinkTitle(string: str, pos: int, maximum: int) -> _Result:
    """Parse a link title delimited by ``"``, ``'`` or ``(...)``.

    Scans from ``pos`` up to (not including) ``maximum``; on failure the
    returned result has ``ok == False``.
    """
    lines = 0
    start = pos
    result = _Result()
    if pos >= maximum:
        return result
    marker = charCodeAt(string, pos)
    # /* " */ /* ' */ /* ( */
    if marker != 0x22 and marker != 0x27 and marker != 0x28:
        return result
    pos += 1
    # if opening marker is "(", switch it to closing marker ")"
    if marker == 0x28:
        marker = 0x29
    while pos < maximum:
        code = charCodeAt(string, pos)
        if code == marker:
            # closing delimiter reached: extract and unescape the title
            title = string[start + 1 : pos]
            title = unescapeAll(title)
            result.pos = pos + 1
            result.lines = lines
            result.str = title
            result.ok = True
            return result
        elif code == 0x28 and marker == 0x29:  # unescaped '(' inside (...) is invalid
            return result
        elif code == 0x0A:
            lines += 1
        elif code == 0x5C and pos + 1 < maximum:  # /* \ */ escape: skip next char
            pos += 1
            if charCodeAt(string, pos) == 0x0A:
                lines += 1
        pos += 1
    # no closing delimiter found
    return result

331
markdown_it/main.py Normal file
View File

@ -0,0 +1,331 @@
from __future__ import annotations
from collections.abc import Callable, Generator, Iterable, Mapping, MutableMapping
from contextlib import contextmanager
from typing import Any
from . import helpers, presets # noqa F401
from .common import normalize_url, utils # noqa F401
from .parser_block import ParserBlock # noqa F401
from .parser_core import ParserCore # noqa F401
from .parser_inline import ParserInline # noqa F401
from .renderer import RendererHTML, RendererProtocol
from .rules_core.state_core import StateCore
from .token import Token
from .utils import OptionsDict
try:
import linkify_it
except ModuleNotFoundError:
linkify_it = None
_PRESETS = {
"default": presets.default.make(),
"js-default": presets.js_default.make(),
"zero": presets.zero.make(),
"commonmark": presets.commonmark.make(),
"gfm-like": presets.gfm_like.make(),
}
class MarkdownIt:
    def __init__(
        self,
        config: str | Mapping = "commonmark",
        options_update: Mapping | None = None,
        *,
        renderer_cls: Callable[[MarkdownIt], RendererProtocol] = RendererHTML,
    ):
        """Main parser class

        :param config: name of configuration to load or a pre-defined dictionary
        :param options_update: dictionary that will be merged into ``config["options"]``
        :param renderer_cls: the class to load as the renderer:
            ``self.renderer = renderer_cls(self)``
        """
        # add modules
        self.utils = utils
        self.helpers: Any = helpers
        # initialise classes
        self.inline = ParserInline()
        self.block = ParserBlock()
        self.core = ParserCore()
        self.renderer = renderer_cls(self)
        # linkify-it-py is an optional dependency; ``self.linkify`` is left
        # as None when it is not installed.
        self.linkify = linkify_it.LinkifyIt() if linkify_it else None
        # set the configuration
        if options_update and not isinstance(options_update, Mapping):
            # catch signature change where renderer_cls was not used as a key-word
            raise TypeError(
                f"options_update should be a mapping: {options_update}"
                "\n(Perhaps you intended this to be the renderer_cls?)"
            )
        self.configure(config, options_update=options_update)
    def __repr__(self) -> str:
        # e.g. "markdown_it.main.MarkdownIt()"
        return f"{self.__class__.__module__}.{self.__class__.__name__}()"
    def __getitem__(self, name: str) -> Any:
        """Access a parser chain ("core", "block", "inline") or the "renderer" by name.

        :raises KeyError: for any other name
        """
        return {
            "inline": self.inline,
            "block": self.block,
            "core": self.core,
            "renderer": self.renderer,
        }[name]
    def set(self, options: MutableMapping) -> None:
        """Set parser options (in the same format as in constructor).
        Probably, you will never need it, but you can change options after constructor call.

        __Note:__ To achieve the best possible performance, don't modify a
        `markdown-it` instance options on the fly. If you need multiple configurations
        it's best to create multiple instances and initialize each with separate config.
        """
        # replaces the whole options mapping; no merging is performed here
        self.options = OptionsDict(options)
    def configure(
        self, presets: str | Mapping, options_update: Mapping | None = None
    ) -> MarkdownIt:
        """Batch load of all options and component settings.
        This is an internal method, and you probably will not need it.
        But if you will - see available presets and data structure
        [here](https://github.com/markdown-it/markdown-it/tree/master/lib/presets)

        We strongly recommend to use presets instead of direct config loads.
        That will give better compatibility with next versions.

        :param presets: a preset name registered in ``_PRESETS`` or a config mapping
        :param options_update: shallow-merged over the preset's options
        :raises KeyError: for an unknown preset name
        :raises ValueError: for an empty config
        """
        if isinstance(presets, str):
            if presets not in _PRESETS:
                raise KeyError(f"Wrong `markdown-it` preset '{presets}', check name")
            config = _PRESETS[presets]
        else:
            config = presets
        if not config:
            raise ValueError("Wrong `markdown-it` config, can't be empty")
        options = config.get("options", {}) or {}
        if options_update:
            # shallow merge: keys in options_update win
            options = {**options, **options_update}
        self.set(options)
        if "components" in config:
            # enableOnly activates exactly the listed rules per chain,
            # disabling everything else
            for name, component in config["components"].items():
                rules = component.get("rules", None)
                if rules:
                    self[name].ruler.enableOnly(rules)
                rules2 = component.get("rules2", None)
                if rules2:
                    self[name].ruler2.enableOnly(rules2)
        return self
    def get_all_rules(self) -> dict[str, list[str]]:
        """Return the names of all rules (enabled and disabled), keyed by chain.

        Chains are "core", "block", "inline", plus "inline2" for the inline
        post-processing ruler.
        """
        rules = {
            chain: self[chain].ruler.get_all_rules()
            for chain in ["core", "block", "inline"]
        }
        rules["inline2"] = self.inline.ruler2.get_all_rules()
        return rules
    def get_active_rules(self) -> dict[str, list[str]]:
        """Return the names of all active rules, keyed by chain.

        Chains are "core", "block", "inline", plus "inline2" for the inline
        post-processing ruler.
        """
        rules = {
            chain: self[chain].ruler.get_active_rules()
            for chain in ["core", "block", "inline"]
        }
        rules["inline2"] = self.inline.ruler2.get_active_rules()
        return rules
    def enable(
        self, names: str | Iterable[str], ignoreInvalid: bool = False
    ) -> MarkdownIt:
        """Enable list or rules. (chainable)

        :param names: rule name or list of rule names to enable.
        :param ignoreInvalid: set `true` to ignore errors when rule not found.

        It will automatically find appropriate components,
        containing rules with given names. If rule not found, and `ignoreInvalid`
        not set - throws exception.

        Example::

            md = MarkdownIt().enable(['sub', 'sup']).disable('smartquotes')
        """
        result = []
        if isinstance(names, str):
            names = [names]
        # Try every chain; each ruler reports back the names it recognised.
        for chain in ["core", "block", "inline"]:
            result.extend(self[chain].ruler.enable(names, True))
        result.extend(self.inline.ruler2.enable(names, True))
        missed = [name for name in names if name not in result]
        if missed and not ignoreInvalid:
            raise ValueError(f"MarkdownIt. Failed to enable unknown rule(s): {missed}")
        return self
    def disable(
        self, names: str | Iterable[str], ignoreInvalid: bool = False
    ) -> MarkdownIt:
        """The same as [[MarkdownIt.enable]], but turn specified rules off. (chainable)

        :param names: rule name or list of rule names to disable.
        :param ignoreInvalid: set `true` to ignore errors when rule not found.
        :raises ValueError: if a rule is not found and `ignoreInvalid` is not set
        """
        result = []
        if isinstance(names, str):
            names = [names]
        # Try every chain; each ruler reports back the names it recognised.
        for chain in ["core", "block", "inline"]:
            result.extend(self[chain].ruler.disable(names, True))
        result.extend(self.inline.ruler2.disable(names, True))
        missed = [name for name in names if name not in result]
        if missed and not ignoreInvalid:
            raise ValueError(f"MarkdownIt. Failed to disable unknown rule(s): {missed}")
        return self
    @contextmanager
    def reset_rules(self) -> Generator[None, None, None]:
        """A context manager, that will reset the current enabled rules on exit."""
        # Snapshot the active rules before handing control back to the caller.
        chain_rules = self.get_active_rules()
        yield
        # Restore whatever was active before the with-block.
        for chain, rules in chain_rules.items():
            if chain != "inline2":
                self[chain].ruler.enableOnly(rules)
        self.inline.ruler2.enableOnly(chain_rules["inline2"])
    def add_render_rule(self, name: str, function: Callable, fmt: str = "html") -> None:
        """Add a rule for rendering a particular Token type.

        Only applied when ``renderer.__output__ == fmt``
        """
        if self.renderer.__output__ == fmt:
            # Bind the function as a method of the renderer instance.
            self.renderer.rules[name] = function.__get__(self.renderer)  # type: ignore
    def use(self, plugin: Callable, *params, **options) -> MarkdownIt:
        """Load specified plugin with given params into current parser instance. (chainable)

        It's just sugar to call ``plugin(md, *params, **options)`` with currying.

        Example::

            def func(tokens, idx):
                tokens[idx].content = tokens[idx].content.replace('foo', 'bar')
            md = MarkdownIt().use(plugin, 'foo_replace', 'text', func)
        """
        plugin(self, *params, **options)
        return self
    def parse(self, src: str, env: MutableMapping | None = None) -> list[Token]:
        """Parse the source string to a token stream

        :param src: source string
        :param env: environment sandbox

        Parse input string and return list of block tokens (special token type
        "inline" will contain list of inline tokens).

        `env` is used to pass data between "distributed" rules and return additional
        metadata like reference info, needed for the renderer. It also can be used to
        inject data in specific cases. Usually, you will be ok to pass `{}`,
        and then pass updated object to renderer.
        """
        env = {} if env is None else env
        if not isinstance(env, MutableMapping):
            raise TypeError(f"Input data should be a MutableMapping, not {type(env)}")
        if not isinstance(src, str):
            raise TypeError(f"Input data should be a string, not {type(src)}")
        state = StateCore(src, self, env)
        self.core.process(state)
        return state.tokens
    def render(self, src: str, env: MutableMapping | None = None) -> Any:
        """Render markdown string into html. It does all magic for you :).

        :param src: source string
        :param env: environment sandbox
        :returns: The output of the loaded renderer

        `env` can be used to inject additional metadata (`{}` by default).
        But you will not need it with high probability. See also comment
        in [[MarkdownIt.parse]].
        """
        env = {} if env is None else env
        return self.renderer.render(self.parse(src, env), self.options, env)
def parseInline(self, src: str, env: MutableMapping | None = None) -> list[Token]:
"""The same as [[MarkdownIt.parse]] but skip all block rules.
:param src: source string
:param env: environment sandbox
It returns the
block tokens list with the single `inline` element, containing parsed inline
tokens in `children` property. Also updates `env` object.
"""
env = {} if env is None else env
if not isinstance(env, MutableMapping):
raise TypeError(f"Input data should be an MutableMapping, not {type(env)}")
if not isinstance(src, str):
raise TypeError(f"Input data should be a string, not {type(src)}")
state = StateCore(src, self, env)
state.inlineMode = True
self.core.process(state)
return state.tokens
    def renderInline(self, src: str, env: MutableMapping | None = None) -> Any:
        """Similar to [[MarkdownIt.render]] but for single paragraph content.

        :param src: source string
        :param env: environment sandbox

        Similar to [[MarkdownIt.render]] but for single paragraph content. Result
        will NOT be wrapped into `<p>` tags.
        """
        env = {} if env is None else env
        return self.renderer.render(self.parseInline(src, env), self.options, env)
    # link methods

    def validateLink(self, url: str) -> bool:
        """Validate if the URL link is allowed in output.

        This validator can prohibit more than really needed to prevent XSS.
        It's a tradeoff to keep code simple and to be secure by default.

        Note: the url should be normalized at this point, and existing entities decoded.
        """
        return normalize_url.validateLink(url)
    def normalizeLink(self, url: str) -> str:
        """Normalize destination URLs in links

        ::

            [label]: destination 'title'
                     ^^^^^^^^^^^
        """
        return normalize_url.normalizeLink(url)
    def normalizeLinkText(self, link: str) -> str:
        """Normalize autolink content

        ::

            <destination>
             ~~~~~~~~~~~
        """
        return normalize_url.normalizeLinkText(link)

109
markdown_it/parser_block.py Normal file
View File

@ -0,0 +1,109 @@
"""Block-level tokenizer."""
from __future__ import annotations
import logging
from . import rules_block
from .ruler import Ruler
from .rules_block.state_block import StateBlock
from .token import Token
LOGGER = logging.getLogger(__name__)
# Default block-level rules, in priority order.
_rules: list[tuple] = [
    # First 2 params - rule name & source. Secondary array - list of rules,
    # which can be terminated by this one.
    ("table", rules_block.table, ["paragraph", "reference"]),
    ("code", rules_block.code),
    ("fence", rules_block.fence, ["paragraph", "reference", "blockquote", "list"]),
    (
        "blockquote",
        rules_block.blockquote,
        ["paragraph", "reference", "blockquote", "list"],
    ),
    ("hr", rules_block.hr, ["paragraph", "reference", "blockquote", "list"]),
    ("list", rules_block.list_block, ["paragraph", "reference", "blockquote"]),
    ("reference", rules_block.reference),
    ("html_block", rules_block.html_block, ["paragraph", "reference", "blockquote"]),
    ("heading", rules_block.heading, ["paragraph", "reference", "blockquote"]),
    ("lheading", rules_block.lheading),
    ("paragraph", rules_block.paragraph),
]
class ParserBlock:
    """
    ParserBlock#ruler -> Ruler

    [[Ruler]] instance. Keep configuration of block rules.
    """

    def __init__(self):
        # Register the default block rules; "alt" is the list of chains
        # (paragraph, blockquote, ...) that this rule may terminate.
        self.ruler = Ruler()
        for data in _rules:
            name = data[0]
            rule = data[1]
            self.ruler.push(name, rule, {"alt": data[2] if len(data) > 2 else []})

    def tokenize(
        self, state: StateBlock, startLine: int, endLine: int, silent: bool = False
    ) -> None:
        """Generate tokens for input range."""
        rules = self.ruler.getRules("")
        line = startLine
        maxNesting = state.md.options.maxNesting
        hasEmptyLines = False

        while line < endLine:
            state.line = line = state.skipEmptyLines(line)
            if line >= endLine:
                break
            if state.sCount[line] < state.blkIndent:
                # Termination condition for nested calls.
                # Nested calls currently used for blockquotes & lists
                break
            if state.level >= maxNesting:
                # If nesting level exceeded - skip tail to the end.
                # That's not ordinary situation and we should not care about content.
                state.line = endLine
                break

            # Try all possible rules.
            # On success, rule should:
            # - update `state.line`
            # - update `state.tokens`
            # - return True
            for rule in rules:
                if rule(state, line, endLine, False):
                    break

            # set state.tight if we had an empty line before current tag
            # i.e. latest empty line should not count
            state.tight = not hasEmptyLines

            line = state.line

            # paragraph might "eat" one newline after it in nested lists
            if (line - 1) < endLine and state.isEmpty(line - 1):
                hasEmptyLines = True

            if line < endLine and state.isEmpty(line):
                hasEmptyLines = True
                line += 1
                state.line = line

    def parse(
        self,
        src: str,
        md,
        env,
        outTokens: list[Token],
        ords: tuple[int, ...] | None = None,
    ) -> list[Token] | None:
        """Process input string and push block tokens into `outTokens`.

        ``ords`` is an optional pre-computed tuple of source char codes,
        shared to avoid recomputing them (see StateBase.srcCharCode).
        Returns None for empty input.
        """
        if not src:
            return None
        state = StateBlock(src, md, env, outTokens, ords)
        self.tokenize(state, state.line, state.lineMax)
        return state.tokens

View File

@ -0,0 +1,32 @@
"""
* class Core
*
* Top-level rules executor. Glues block/inline parsers and does intermediate
* transformations.
"""
from __future__ import annotations
from .ruler import RuleFunc, Ruler
from .rules_core import block, inline, linkify, normalize, replace, smartquotes
from .rules_core.state_core import StateCore
# The default core rule chain, executed in order by ParserCore.process.
_rules: list[tuple[str, RuleFunc]] = [
    ("normalize", normalize),
    ("block", block),
    ("inline", inline),
    ("linkify", linkify),
    ("replacements", replace),
    ("smartquotes", smartquotes),
]
class ParserCore:
    """Top-level rules executor: glues the block/inline parsers together and
    runs the intermediate "core" transformations (normalize, linkify, ...).
    """

    def __init__(self):
        # Populate the ruler with the default core rule chain, in order.
        self.ruler = Ruler()
        for rule_name, rule_fn in _rules:
            self.ruler.push(rule_name, rule_fn)

    def process(self, state: StateCore) -> None:
        """Executes core chain rules, mutating ``state`` in place."""
        for rule_fn in self.ruler.getRules(""):
            rule_fn(state)

View File

@ -0,0 +1,124 @@
"""Tokenizes paragraph content.
"""
from __future__ import annotations
from . import rules_inline
from .ruler import RuleFunc, Ruler
from .rules_inline.state_inline import StateInline
from .token import Token
# Parser rules: the first-pass inline tokenizer chain, in priority order.
_rules: list[tuple[str, RuleFunc]] = [
    ("text", rules_inline.text),
    ("newline", rules_inline.newline),
    ("escape", rules_inline.escape),
    ("backticks", rules_inline.backtick),
    ("strikethrough", rules_inline.strikethrough.tokenize),
    ("emphasis", rules_inline.emphasis.tokenize),
    ("link", rules_inline.link),
    ("image", rules_inline.image),
    ("autolink", rules_inline.autolink),
    ("html_inline", rules_inline.html_inline),
    ("entity", rules_inline.entity),
]

# Second-pass (post-processing) rules, e.g. pairing of emphasis delimiters.
_rules2: list[tuple[str, RuleFunc]] = [
    ("balance_pairs", rules_inline.link_pairs),
    ("strikethrough", rules_inline.strikethrough.postProcess),
    ("emphasis", rules_inline.emphasis.postProcess),
    ("text_collapse", rules_inline.text_collapse),
]
class ParserInline:
    """Tokenizer for inline-level content (text, emphasis, links, ...)."""

    def __init__(self):
        self.ruler = Ruler()
        for name, rule in _rules:
            self.ruler.push(name, rule)
        # Second ruler used for post-processing (e.g. in emphasis-like rules)
        self.ruler2 = Ruler()
        for name, rule2 in _rules2:
            self.ruler2.push(name, rule2)

    def skipToken(self, state: StateInline) -> None:
        """Skip a single token by running all rules in validation mode.

        Advances ``state.pos`` past the token (falling back to a single
        character when no rule matches) and memoizes the resulting position
        in ``state.cache``.
        """
        ok = False
        pos = state.pos
        rules = self.ruler.getRules("")
        maxNesting = state.md.options["maxNesting"]
        cache = state.cache

        if pos in cache:
            state.pos = cache[pos]
            return

        if state.level < maxNesting:
            for rule in rules:
                # Increment state.level and decrement it later to limit recursion.
                # It's harmless to do here, because no tokens are created.
                # But ideally, we'd need a separate private state variable for this purpose.
                state.level += 1
                ok = rule(state, True)
                state.level -= 1
                if ok:
                    break
        else:
            # Too much nesting, just skip until the end of the paragraph.
            #
            # NOTE: this will cause links to behave incorrectly in the following case,
            #       when an amount of `[` is exactly equal to `maxNesting + 1`:
            #
            #       [[[[[[[[[[[[[[[[[[[[[foo]()
            #
            # TODO: remove this workaround when CM standard will allow nested links
            #       (we can replace it by preventing links from being parsed in
            #       validation mode)
            #
            state.pos = state.posMax

        if not ok:
            state.pos += 1
        cache[pos] = state.pos

    def tokenize(self, state: StateInline) -> None:
        """Generate tokens for input range."""
        ok = False
        rules = self.ruler.getRules("")
        end = state.posMax
        maxNesting = state.md.options["maxNesting"]

        while state.pos < end:
            # Try all possible rules.
            # On success, rule should:
            #
            # - update `state.pos`
            # - update `state.tokens`
            # - return true
            if state.level < maxNesting:
                for rule in rules:
                    ok = rule(state, False)
                    if ok:
                        break
            if ok:
                if state.pos >= end:
                    break
                continue
            # No rule matched: treat the current char as plain pending text.
            state.pending += state.src[state.pos]
            state.pos += 1

        if state.pending:
            state.pushPending()

    def parse(self, src: str, md, env, tokens: list[Token]) -> list[Token]:
        """Process input string and push inline tokens into `tokens`"""
        state = StateInline(src, md, env, tokens)
        self.tokenize(state)
        # Run the post-processing pass (delimiter balancing etc.).
        rules2 = self.ruler2.getRules("")
        for rule in rules2:
            rule(state)
        return state.tokens

49
markdown_it/port.yaml Normal file
View File

@ -0,0 +1,49 @@
- package: markdown-it/markdown-it
version: 12.2.0
commit: 6e2de08a0b03d3d0dcc524b89710ce05f83a0283
date: Aug 2, 2021
notes:
- Rename variables that use python built-in names, e.g.
- `max` -> `maximum`
- `len` -> `length`
- `str` -> `string`
- |
Convert JS `for` loops to `while` loops
    this is generally the main difference between the two code bases,
because in python you can't do e.g. `for {i=1;i<x;i++} {}`
- |
`env` is a common Python dictionary, and so does not have attribute access to keys,
as with JavaScript dictionaries.
    `options` have attribute access only to core markdown-it configuration options
- |
`Token.attrs` is a dictionary, instead of a list of lists.
Upstream the list format is only used to guarantee order: https://github.com/markdown-it/markdown-it/issues/142,
but in Python 3.7+ order of dictionaries is guaranteed.
One should anyhow use the `attrGet`, `attrSet`, `attrPush` and `attrJoin` methods
to manipulate `Token.attrs`, which have an identical signature to those upstream.
- Use python version of `charCodeAt`
- |
Reduce use of charCodeAt() by storing char codes in a srcCharCodes attribute for state
objects and sharing those whenever possible
This provides a significant performance boost
- |
In markdown_it/rules_block/reference.py,
record line range in state.env["references"] and add state.env["duplicate_refs"]
This is to allow renderers to report on issues regarding references
- |
The `MarkdownIt.__init__` signature is slightly different for updating options,
since you must always specify the config first, e.g.
use `MarkdownIt("commonmark", {"html": False})` instead of `MarkdownIt({"html": False})`
- The default configuration preset for `MarkdownIt` is "commonmark" not "default"
- Allow custom renderer to be passed to `MarkdownIt`
- |
change render method signatures
`func(tokens, idx, options, env, slf)` to
`func(self, tokens, idx, options, env)`
- |
Extensions add render methods by format
`MarkdownIt.add_render_rule(name, function, fmt="html")`,
rather than `MarkdownIt.renderer.rules[name] = function`
and renderers should declare a class property `__output__ = "html"`.
This allows for extensibility to more than just HTML renderers
- inline tokens in tables are assigned a map (this is helpful for propagation to children)

View File

@ -0,0 +1,27 @@
__all__ = ("commonmark", "default", "zero", "js_default", "gfm_like")
from . import commonmark, default, zero
js_default = default
class gfm_like:
    """GitHub Flavoured Markdown (GFM) like.

    This adds the linkify, table and strikethrough components to CommonMark.

    Note, it lacks the task-list items and raw-HTML filtering that would be
    needed to meet the full GFM specification
    (see https://github.github.com/gfm/#autolinks-extension-).
    """

    @staticmethod
    def make():
        # Start from the CommonMark preset and layer the GFM extras on top.
        config = commonmark.make()
        config["components"]["core"]["rules"].append("linkify")
        config["components"]["block"]["rules"].append("table")
        config["components"]["inline"]["rules"].append("strikethrough")
        config["components"]["inline"]["rules2"].append("strikethrough")
        config["options"]["linkify"] = True
        config["options"]["html"] = True
        return config

View File

@ -0,0 +1,73 @@
"""Commonmark default options.
This differs to presets.default,
primarily in that it allows HTML and does not enable components:
- block: table
- inline: strikethrough
"""
def make():
    """Build the "commonmark" preset configuration.

    Unlike ``presets.default`` this enables raw HTML and XHTML-style
    self-closing tags, and activates only the rules required by the
    CommonMark spec (no table, no strikethrough).
    """
    options = {
        "maxNesting": 20,  # internal protection, recursion limit
        "html": True,  # enable HTML tags in source
        # (shorthand for .enable(["html_inline", "html_block"]))
        "linkify": False,  # autoconvert URL-like text to links (linkify rule)
        "typographer": False,  # language-neutral replacements + quote beautification
        # Double/single quote replacement pairs, used when typographer is
        # enabled and smartquotes is on; may be a string or an array, e.g.
        # '«»„“' for Russian, '„“‚‘' for German,
        # ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (with nbsp).
        "quotes": "\u201c\u201d\u2018\u2019",  # “”‘’
        # Renderer-specific options, read directly by the HTML renderer:
        "xhtmlOut": True,  # use '/' to close single tags (<br />)
        "breaks": False,  # convert '\n' in paragraphs into <br>
        "langPrefix": "language-",  # CSS language prefix for fenced blocks
        # Optional highlighter: (content, lang, attrs) -> escaped HTML, or ''
        # to fall back to external escaping; output starting with <pre...
        # skips the internal wrapper.
        "highlight": None,
    }
    block_rules = [
        "blockquote",
        "code",
        "fence",
        "heading",
        "hr",
        "html_block",
        "lheading",
        "list",
        "reference",
        "paragraph",
    ]
    inline_rules = [
        "autolink",
        "backticks",
        "emphasis",
        "entity",
        "escape",
        "html_inline",
        "image",
        "link",
        "newline",
        "text",
    ]
    components = {
        "core": {"rules": ["normalize", "block", "inline"]},
        "block": {"rules": block_rules},
        "inline": {
            "rules": inline_rules,
            "rules2": ["balance_pairs", "emphasis", "text_collapse"],
        },
    }
    return {"options": options, "components": components}

View File

@ -0,0 +1,34 @@
"""markdown-it default options."""
def make():
    """Build the markdown-it "default" preset: all rules enabled, HTML off."""
    options = {
        "maxNesting": 100,  # internal protection, recursion limit
        "html": False,  # enable HTML tags in source
        # (shorthand for .disable(["html_inline", "html_block"]))
        "linkify": False,  # autoconvert URL-like text to links (linkify rule)
        "typographer": False,  # language-neutral replacements + quote beautification
        # Double/single quote replacement pairs, used when typographer is
        # enabled and smartquotes is on; may be a string or an array, e.g.
        # '«»„“' for Russian, '„“‚‘' for German,
        # ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (with nbsp).
        "quotes": "\u201c\u201d\u2018\u2019",  # “”‘’
        # Renderer-specific options, read directly by the HTML renderer:
        "xhtmlOut": False,  # use '/' to close single tags (<br />)
        "breaks": False,  # convert '\n' in paragraphs into <br>
        "langPrefix": "language-",  # CSS language prefix for fenced blocks
        # Optional highlighter: (content, lang, attrs) -> escaped HTML, or ''
        # to fall back to external escaping; output starting with <pre...
        # skips the internal wrapper.
        "highlight": None,
    }
    # Empty component configs mean "leave every rule in its default state".
    components = {"core": {}, "block": {}, "inline": {}}
    return {"options": options, "components": components}

View File

@ -0,0 +1,39 @@
"""
"Zero" preset, with nothing enabled. Useful for manual configuring of simple
modes. For example, to parse bold/italic only.
"""
def make():
    """Build the "zero" preset: nothing enabled beyond plain paragraphs/text.

    Useful for manually composing a minimal configuration, e.g. to parse
    bold/italic only.
    """
    options = {
        "maxNesting": 20,  # internal protection, recursion limit
        "html": False,  # enable HTML tags in source
        # (shorthand for .disable(["html_inline", "html_block"]))
        "linkify": False,  # autoconvert URL-like text to links (linkify rule)
        "typographer": False,  # language-neutral replacements + quote beautification
        # Double/single quote replacement pairs, used when typographer is
        # enabled and smartquotes is on; may be a string or an array, e.g.
        # '«»„“' for Russian, '„“‚‘' for German,
        # ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (with nbsp).
        "quotes": "\u201c\u201d\u2018\u2019",  # “”‘’
        # Renderer-specific options, read directly by the HTML renderer:
        "xhtmlOut": False,  # use '/' to close single tags (<br />)
        "breaks": False,  # convert '\n' in paragraphs into <br>
        "langPrefix": "language-",  # CSS language prefix for fenced blocks
        # Optional highlighter: (content, lang, attrs) -> escaped HTML, or ''
        # to fall back to external escaping; output starting with <pre...
        # skips the internal wrapper.
        "highlight": None,
    }
    components = {
        "core": {"rules": ["normalize", "block", "inline"]},
        "block": {"rules": ["paragraph"]},
        "inline": {"rules": ["text"], "rules2": ["balance_pairs", "text_collapse"]},
    }
    return {"options": options, "components": components}

1
markdown_it/py.typed Normal file
View File

@ -0,0 +1 @@
# Marker file for PEP 561

339
markdown_it/renderer.py Normal file
View File

@ -0,0 +1,339 @@
"""
class Renderer
Generates HTML from parsed token stream. Each instance has independent
copy of rules. Those can be rewritten with ease. Also, you can add new
rules if you create plugin and adds new token types.
"""
from __future__ import annotations
from collections.abc import MutableMapping, Sequence
import inspect
from typing import Any, ClassVar
from .common.utils import escapeHtml, unescapeAll
from .token import Token
from .utils import OptionsDict
try:
from typing import Protocol
except ImportError: # Python <3.8 doesn't have `Protocol` in the stdlib
from typing_extensions import Protocol # type: ignore[misc]
class RendererProtocol(Protocol):
    """Structural type for renderers accepted by ``MarkdownIt``:
    anything exposing an ``__output__`` format name and a ``render`` method.
    """

    __output__: ClassVar[str]

    def render(
        self, tokens: Sequence[Token], options: OptionsDict, env: MutableMapping
    ) -> Any:
        ...
class RendererHTML(RendererProtocol):
    """Contains render rules for tokens. Can be updated and extended.

    Example:

    Each rule is called as independent static function with fixed signature:

    ::

        class Renderer:
            def token_type_name(self, tokens, idx, options, env):
                # ...
                return renderedHTML

    ::

        class CustomRenderer(RendererHTML):
            def strong_open(self, tokens, idx, options, env):
                return '<b>'
            def strong_close(self, tokens, idx, options, env):
                return '</b>'

        md = MarkdownIt(renderer_cls=CustomRenderer)

        result = md.render(...)

    See https://github.com/markdown-it/markdown-it/blob/master/lib/renderer.js
    for more details and examples.
    """

    __output__ = "html"

    def __init__(self, parser=None):
        # Collect all public bound methods as render rules, keyed by token
        # type. Names starting with "render" or "_" are infrastructure, not
        # token rules, so they are excluded.
        self.rules = {
            k: v
            for k, v in inspect.getmembers(self, predicate=inspect.ismethod)
            if not (k.startswith("render") or k.startswith("_"))
        }

    def render(
        self, tokens: Sequence[Token], options: OptionsDict, env: MutableMapping
    ) -> str:
        """Takes token stream and generates HTML.

        :param tokens: list on block tokens to render
        :param options: params of parser instance
        :param env: additional data from parsed input
        """
        result = ""
        for i, token in enumerate(tokens):
            if token.type == "inline":
                assert token.children is not None
                result += self.renderInline(token.children, options, env)
            elif token.type in self.rules:
                result += self.rules[token.type](tokens, i, options, env)
            else:
                result += self.renderToken(tokens, i, options, env)
        return result

    def renderInline(
        self, tokens: Sequence[Token], options: OptionsDict, env: MutableMapping
    ) -> str:
        """The same as ``render``, but for single token of `inline` type.

        :param tokens: list on block tokens to render
        :param options: params of parser instance
        :param env: additional data from parsed input (references, for example)
        """
        result = ""
        for i, token in enumerate(tokens):
            if token.type in self.rules:
                result += self.rules[token.type](tokens, i, options, env)
            else:
                result += self.renderToken(tokens, i, options, env)
        return result

    def renderToken(
        self,
        tokens: Sequence[Token],
        idx: int,
        options: OptionsDict,
        env: MutableMapping,
    ) -> str:
        """Default token renderer.

        Can be overridden by custom function

        :param idx: token index to render
        :param options: params of parser instance
        """
        result = ""
        needLf = False
        token = tokens[idx]

        # Tight list paragraphs
        if token.hidden:
            return ""

        # Insert a newline between hidden paragraph and subsequent opening
        # block-level tag.
        #
        # For example, here we should insert a newline before blockquote:
        #  - a
        #  >
        #
        if token.block and token.nesting != -1 and idx and tokens[idx - 1].hidden:
            result += "\n"

        # Add token name, e.g. `<img`
        result += ("</" if token.nesting == -1 else "<") + token.tag

        # Encode attributes, e.g. `<img src="foo"`
        result += self.renderAttrs(token)

        # Add a slash for self-closing tags, e.g. `<img src="foo" /`
        if token.nesting == 0 and options["xhtmlOut"]:
            result += " /"

        # Check if we need to add a newline after this tag
        if token.block:
            needLf = True
            if token.nesting == 1:
                if idx + 1 < len(tokens):
                    nextToken = tokens[idx + 1]
                    if nextToken.type == "inline" or nextToken.hidden:
                        # Block-level tag containing an inline tag.
                        #
                        needLf = False
                    elif nextToken.nesting == -1 and nextToken.tag == token.tag:
                        # Opening tag + closing tag of the same type. E.g. `<li></li>`.
                        #
                        needLf = False
        result += ">\n" if needLf else ">"
        return result

    @staticmethod
    def renderAttrs(token: Token) -> str:
        """Render token attributes to string."""
        result = ""
        for key, value in token.attrItems():
            # Both key and value are HTML-escaped to keep the output well-formed.
            result += " " + escapeHtml(key) + '="' + escapeHtml(str(value)) + '"'
        return result

    def renderInlineAsText(
        self,
        tokens: Sequence[Token] | None,
        options: OptionsDict,
        env: MutableMapping,
    ) -> str:
        """Special kludge for image `alt` attributes to conform CommonMark spec.

        Don't try to use it! Spec requires to show `alt` content with stripped markup,
        instead of simple escaping.

        :param tokens: list on block tokens to render
        :param options: params of parser instance
        :param env: additional data from parsed input
        """
        result = ""
        for token in tokens or []:
            if token.type == "text":
                result += token.content
            elif token.type == "image":
                assert token.children is not None
                result += self.renderInlineAsText(token.children, options, env)
            elif token.type == "softbreak":
                result += "\n"
        return result

    ###################################################

    def code_inline(self, tokens: Sequence[Token], idx: int, options, env) -> str:
        """Render an inline code span: ``<code>...</code>``."""
        token = tokens[idx]
        return (
            "<code"
            + self.renderAttrs(token)
            + ">"
            + escapeHtml(tokens[idx].content)
            + "</code>"
        )

    def code_block(
        self,
        tokens: Sequence[Token],
        idx: int,
        options: OptionsDict,
        env: MutableMapping,
    ) -> str:
        """Render an indented code block: ``<pre><code>...</code></pre>``."""
        token = tokens[idx]
        return (
            "<pre"
            + self.renderAttrs(token)
            + "><code>"
            + escapeHtml(tokens[idx].content)
            + "</code></pre>\n"
        )

    def fence(
        self,
        tokens: Sequence[Token],
        idx: int,
        options: OptionsDict,
        env: MutableMapping,
    ) -> str:
        """Render a fenced code block, using ``options.highlight`` if provided."""
        token = tokens[idx]
        # The info string ("```python ..."), split into language + attributes.
        info = unescapeAll(token.info).strip() if token.info else ""
        langName = ""
        langAttrs = ""
        if info:
            arr = info.split(maxsplit=1)
            langName = arr[0]
            if len(arr) == 2:
                langAttrs = arr[1]

        if options.highlight:
            highlighted = options.highlight(
                token.content, langName, langAttrs
            ) or escapeHtml(token.content)
        else:
            highlighted = escapeHtml(token.content)

        # A highlighter that returns its own <pre... wrapper skips ours.
        if highlighted.startswith("<pre"):
            return highlighted + "\n"

        # If language exists, inject class gently, without modifying original token.
        # May be, one day we will add .deepClone() for token and simplify this part, but
        # now we prefer to keep things local.
        if info:
            # Fake token just to render attributes
            tmpToken = Token(type="", tag="", nesting=0, attrs=token.attrs.copy())
            tmpToken.attrJoin("class", options.langPrefix + langName)
            return (
                "<pre><code"
                + self.renderAttrs(tmpToken)
                + ">"
                + highlighted
                + "</code></pre>\n"
            )
        return (
            "<pre><code"
            + self.renderAttrs(token)
            + ">"
            + highlighted
            + "</code></pre>\n"
        )

    def image(
        self,
        tokens: Sequence[Token],
        idx: int,
        options: OptionsDict,
        env: MutableMapping,
    ) -> str:
        """Render an image token, filling its mandatory ``alt`` attribute."""
        token = tokens[idx]

        # "alt" attr MUST be set, even if empty. Because it's mandatory and
        # should be placed on proper position for tests.
        assert (
            token.attrs and "alt" in token.attrs
        ), '"image" token\'s attrs must contain `alt`'

        # Replace content with actual value
        token.attrSet("alt", self.renderInlineAsText(token.children, options, env))
        return self.renderToken(tokens, idx, options, env)

    def hardbreak(
        self, tokens: Sequence[Token], idx: int, options: OptionsDict, *args
    ) -> str:
        """Render a hard line break."""
        return "<br />\n" if options.xhtmlOut else "<br>\n"

    def softbreak(
        self, tokens: Sequence[Token], idx: int, options: OptionsDict, *args
    ) -> str:
        """Render a soft line break: ``<br>`` only when ``options.breaks`` is set."""
        return (
            ("<br />\n" if options.xhtmlOut else "<br>\n") if options.breaks else "\n"
        )

    def text(self, tokens: Sequence[Token], idx: int, *args) -> str:
        """Render plain text (HTML-escaped)."""
        return escapeHtml(tokens[idx].content)

    def html_block(self, tokens: Sequence[Token], idx: int, *args) -> str:
        """Render raw block-level HTML verbatim."""
        return tokens[idx].content

    def html_inline(self, tokens: Sequence[Token], idx: int, *args) -> str:
        """Render raw inline HTML verbatim."""
        return tokens[idx].content

237
markdown_it/ruler.py Normal file
View File

@ -0,0 +1,237 @@
"""
class Ruler
Helper class, used by [[MarkdownIt#core]], [[MarkdownIt#block]] and
[[MarkdownIt#inline]] to manage sequences of functions (rules):
- keep rules in defined order
- assign the name to each rule
- enable/disable rules
- add/replace rules
- allow assign rules to additional named chains (in the same)
- caching lists of active rules
You will not need use this class directly until write plugins. For simple
rules control use [[MarkdownIt.disable]], [[MarkdownIt.enable]] and
[[MarkdownIt.use]].
"""
from __future__ import annotations
from collections.abc import Callable, Iterable, MutableMapping
from dataclasses import dataclass, field
from typing import TYPE_CHECKING
from markdown_it._compat import DATACLASS_KWARGS
if TYPE_CHECKING:
from markdown_it import MarkdownIt
class StateBase:
    """Shared base for parser state objects.

    Keeps the raw source text together with ``srcCharCode`` — a tuple of
    code points mirroring ``src`` that is recomputed whenever ``src`` is
    reassigned (cuts down on repeated ``ord``/``charCodeAt`` calls).
    """

    srcCharCode: tuple[int, ...]

    def __init__(self, src: str, md: MarkdownIt, env: MutableMapping):
        self.md = md
        self.env = env
        self.src = src  # property setter also populates srcCharCode

    @property
    def src(self) -> str:
        return self._src

    @src.setter
    def src(self, value: str) -> None:
        self._src = value
        # Keep the cached code-point view in sync with the text.
        self.srcCharCode = tuple(map(ord, value))
# The first positional arg is always a subtype of `StateBase`. Other
# arguments may or may not exist, based on the rule's type (block,
# core, inline). Return type is either `None` or `bool` based on the
# rule's type. (A plain Callable alias: signatures vary too much per
# chain to express more precisely.)
RuleFunc = Callable
@dataclass(**DATACLASS_KWARGS)
class Rule:
    # Unique rule name within a ruler.
    name: str
    # Whether the rule participates in compiled rule chains.
    enabled: bool
    # The rule implementation; excluded from repr to keep output readable.
    fn: RuleFunc = field(repr=False)
    # Names of alternate chains this rule also belongs to (e.g. "paragraph").
    alt: list[str]
class Ruler:
    """Ordered registry of named parser rules.

    Rules are stored in execution order; each rule may also belong to
    alternate named chains (its ``alt`` list).  A chain-name -> enabled
    rule functions cache is built lazily by :meth:`__compile__` and is
    invalidated (set to ``None``) by every mutation.
    """

    def __init__(self):
        # List of added rules, in execution order.
        self.__rules__: list[Rule] = []
        # Cached rule chains.
        #
        # First level - chain name, '' for default.
        # Second level - digital anchor for fast filtering by charcodes.
        #
        # ``None`` marks the cache stale; rebuilt on the next getRules().
        self.__cache__: dict[str, list[RuleFunc]] | None = None

    def __find__(self, name: str) -> int:
        """Find rule index by name; return -1 when no rule matches."""
        for i, rule in enumerate(self.__rules__):
            if rule.name == name:
                return i
        return -1

    def __compile__(self) -> None:
        """Build the rules lookup cache (chain name -> enabled rule fns)."""
        chains = {""}
        # collect unique chain names used by enabled rules
        for rule in self.__rules__:
            if not rule.enabled:
                continue
            for name in rule.alt:
                chains.add(name)
        self.__cache__ = {}
        for chain in chains:
            self.__cache__[chain] = []
            for rule in self.__rules__:
                if not rule.enabled:
                    continue
                # a named chain only contains rules that list it in ``alt``
                if chain and (chain not in rule.alt):
                    continue
                self.__cache__[chain].append(rule.fn)

    def at(self, ruleName: str, fn: RuleFunc, options=None):
        """Replace rule by name with new function & options.

        :param ruleName: rule name to replace.
        :param fn: new rule function.
        :param options: new rule options (not mandatory).
        :raises: KeyError if name not found
        """
        index = self.__find__(ruleName)
        options = options or {}
        if index == -1:
            raise KeyError(f"Parser rule not found: {ruleName}")
        self.__rules__[index].fn = fn
        self.__rules__[index].alt = options.get("alt", [])
        self.__cache__ = None

    def before(self, beforeName: str, ruleName: str, fn: RuleFunc, options=None):
        """Add new rule to chain before one with given name.

        :param beforeName: new rule will be added before this one.
        :param ruleName: name of the added rule.
        :param fn: new rule function.
        :param options: new rule options (not mandatory).
        :raises: KeyError if name not found
        """
        index = self.__find__(beforeName)
        options = options or {}
        if index == -1:
            raise KeyError(f"Parser rule not found: {beforeName}")
        self.__rules__.insert(index, Rule(ruleName, True, fn, options.get("alt", [])))
        self.__cache__ = None

    def after(self, afterName: str, ruleName: str, fn: RuleFunc, options=None):
        """Add new rule to chain after one with given name.

        :param afterName: new rule will be added after this one.
        :param ruleName: name of the added rule.
        :param fn: new rule function.
        :param options: new rule options (not mandatory).
        :raises: KeyError if name not found
        """
        index = self.__find__(afterName)
        options = options or {}
        if index == -1:
            raise KeyError(f"Parser rule not found: {afterName}")
        self.__rules__.insert(
            index + 1, Rule(ruleName, True, fn, options.get("alt", []))
        )
        self.__cache__ = None

    def push(self, ruleName: str, fn: RuleFunc, options=None):
        """Push new rule to the end of chain.

        :param ruleName: new rule will be added to the end of chain.
        :param fn: new rule function.
        :param options: new rule options (not mandatory).
        """
        self.__rules__.append(Rule(ruleName, True, fn, (options or {}).get("alt", [])))
        self.__cache__ = None

    def enable(self, names: str | Iterable[str], ignoreInvalid: bool = False):
        """Enable rules with given names.

        :param names: name or list of rule names to enable.
        :param ignoreInvalid: ignore errors when rule not found
        :raises: KeyError if name not found and not ignoreInvalid
        :return: list of found rule names
        """
        if isinstance(names, str):
            names = [names]
        result = []
        for name in names:
            idx = self.__find__(name)
            if idx < 0:
                if ignoreInvalid:
                    continue
                raise KeyError(f"Rules manager: invalid rule name {name}")
            self.__rules__[idx].enabled = True
            result.append(name)
        self.__cache__ = None
        return result

    def enableOnly(self, names: str | Iterable[str], ignoreInvalid: bool = False):
        """Enable rules with given names, and disable everything else.

        :param names: name or list of rule names to enable.
        :param ignoreInvalid: ignore errors when rule not found
        :raises: KeyError if name not found and not ignoreInvalid
        :return: list of found rule names
        """
        if isinstance(names, str):
            names = [names]
        for rule in self.__rules__:
            rule.enabled = False
        # FIX: return the enabled names, as documented above (the result was
        # previously computed by ``enable`` but silently discarded).
        return self.enable(names, ignoreInvalid)

    def disable(self, names: str | Iterable[str], ignoreInvalid: bool = False):
        """Disable rules with given names.

        :param names: name or list of rule names to disable.
        :param ignoreInvalid: ignore errors when rule not found
        :raises: KeyError if name not found and not ignoreInvalid
        :return: list of found rule names
        """
        if isinstance(names, str):
            names = [names]
        result = []
        for name in names:
            idx = self.__find__(name)
            if idx < 0:
                if ignoreInvalid:
                    continue
                raise KeyError(f"Rules manager: invalid rule name {name}")
            self.__rules__[idx].enabled = False
            result.append(name)
        self.__cache__ = None
        return result

    def getRules(self, chainName: str) -> list[RuleFunc]:
        """Return array of active functions (rules) for given chain name.

        It analyzes rules configuration, compiles caches if not exists and
        returns result.

        Default chain name is `''` (empty string). It can't be skipped.
        That's done intentionally, to keep signature monomorphic for high
        speed.
        """
        if self.__cache__ is None:
            self.__compile__()
            assert self.__cache__ is not None
        # Chain can be empty, if rules disabled. But we still have to return Array.
        return self.__cache__.get(chainName, []) or []

    def get_all_rules(self) -> list[str]:
        """Return all available rule names."""
        return [r.name for r in self.__rules__]

    def get_active_rules(self) -> list[str]:
        """Return the active rule names."""
        return [r.name for r in self.__rules__ if r.enabled]

View File

@ -0,0 +1,27 @@
__all__ = (
"StateBlock",
"paragraph",
"heading",
"lheading",
"code",
"fence",
"hr",
"list_block",
"reference",
"blockquote",
"html_block",
"table",
)
from .blockquote import blockquote
from .code import code
from .fence import fence
from .heading import heading
from .hr import hr
from .html_block import html_block
from .lheading import lheading
from .list import list_block
from .paragraph import paragraph
from .reference import reference
from .state_block import StateBlock
from .table import table

View File

@ -0,0 +1,299 @@
# Block quotes
from __future__ import annotations
import logging
from ..common.utils import isSpace
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)
def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Block-quote rule: lines starting with ``>``.

    Temporarily rewrites the per-line offset arrays (``bMarks``,
    ``bsCount``, ``sCount``, ``tShift``) so the inner content can be fed
    back through the block tokenizer, then restores them afterwards.
    """
    LOGGER.debug(
        "entering blockquote: %s, %s, %s, %s", state, startLine, endLine, silent
    )

    oldLineMax = state.lineMax
    pos = state.bMarks[startLine] + state.tShift[startLine]
    max = state.eMarks[startLine]

    # if it's indented more than 3 spaces, it should be a code block
    if (state.sCount[startLine] - state.blkIndent) >= 4:
        return False

    # check the block quote marker
    if state.srcCharCode[pos] != 0x3E:  # /* > */
        return False
    pos += 1

    # we know that it's going to be a valid blockquote,
    # so no point trying to find the end of it in silent mode
    if silent:
        return True

    # set offset past spaces and ">"
    initial = offset = state.sCount[startLine] + 1

    # code point right after '>', or None at end of source
    try:
        second_char_code: int | None = state.srcCharCode[pos]
    except IndexError:
        second_char_code = None

    # skip one optional space after '>'
    if second_char_code == 0x20:  # /* space */
        # ' >   test '
        #     ^ -- position start of line here:
        pos += 1
        initial += 1
        offset += 1
        adjustTab = False
        spaceAfterMarker = True
    elif second_char_code == 0x09:  # /* tab */
        spaceAfterMarker = True

        if (state.bsCount[startLine] + offset) % 4 == 3:
            # '  >\t  test '
            #       ^ -- position start of line here (tab has width==1)
            pos += 1
            initial += 1
            offset += 1
            adjustTab = False
        else:
            # ' >\t  test '
            #    ^ -- position start of line here + shift bsCount slightly
            #         to make extra space appear
            adjustTab = True
    else:
        spaceAfterMarker = False

    # remember original offsets so they can be restored at the end
    oldBMarks = [state.bMarks[startLine]]
    state.bMarks[startLine] = pos

    # advance past the leading whitespace, expanding tabs
    while pos < max:
        ch = state.srcCharCode[pos]

        if isSpace(ch):
            if ch == 0x09:  # / tab /
                offset += (
                    4
                    - (offset + state.bsCount[startLine] + (1 if adjustTab else 0)) % 4
                )
            else:
                offset += 1
        else:
            break

        pos += 1

    oldBSCount = [state.bsCount[startLine]]
    state.bsCount[startLine] = (
        state.sCount[startLine] + 1 + (1 if spaceAfterMarker else 0)
    )

    lastLineEmpty = pos >= max

    oldSCount = [state.sCount[startLine]]
    state.sCount[startLine] = offset - initial

    oldTShift = [state.tShift[startLine]]
    state.tShift[startLine] = pos - state.bMarks[startLine]

    terminatorRules = state.md.block.ruler.getRules("blockquote")

    oldParentType = state.parentType
    state.parentType = "blockquote"

    # Search the end of the block
    #
    # Block ends with either:
    #  1. an empty line outside:
    #     ```
    #     > test
    #
    #     ```
    #  2. an empty line inside:
    #     ```
    #     >
    #     test
    #     ```
    #  3. another tag:
    #     ```
    #     > test
    #      - - -
    #     ```

    # for (nextLine = startLine + 1; nextLine < endLine; nextLine++) {
    nextLine = startLine + 1
    while nextLine < endLine:
        # check if it's outdented, i.e. it's inside list item and indented
        # less than said list item:
        #
        # ```
        # 1. anything
        #    > current blockquote
        # 2. checking this line
        # ```
        isOutdented = state.sCount[nextLine] < state.blkIndent

        pos = state.bMarks[nextLine] + state.tShift[nextLine]
        max = state.eMarks[nextLine]

        if pos >= max:
            # Case 1: line is not inside the blockquote, and this line is empty.
            break

        # mirror of JS `srcCharCode[pos++] === 0x3E`: consume one char
        # unconditionally, but only treat the line as quoted when it
        # starts with '>' and is not outdented
        evaluatesTrue = state.srcCharCode[pos] == 0x3E and not isOutdented  # /* > */
        pos += 1
        if evaluatesTrue:
            # This line is inside the blockquote.

            # set offset past spaces and ">"
            initial = offset = state.sCount[nextLine] + 1

            try:
                next_char: int | None = state.srcCharCode[pos]
            except IndexError:
                next_char = None

            # skip one optional space after '>'
            if next_char == 0x20:  # /* space */
                # ' >   test '
                #     ^ -- position start of line here:
                pos += 1
                initial += 1
                offset += 1
                adjustTab = False
                spaceAfterMarker = True
            elif next_char == 0x09:  # /* tab */
                spaceAfterMarker = True

                if (state.bsCount[nextLine] + offset) % 4 == 3:
                    # '  >\t  test '
                    #       ^ -- position start of line here (tab has width==1)
                    pos += 1
                    initial += 1
                    offset += 1
                    adjustTab = False
                else:
                    # ' >\t  test '
                    #    ^ -- position start of line here + shift bsCount slightly
                    #         to make extra space appear
                    adjustTab = True
            else:
                spaceAfterMarker = False

            oldBMarks.append(state.bMarks[nextLine])
            state.bMarks[nextLine] = pos

            while pos < max:
                ch = state.srcCharCode[pos]

                if isSpace(ch):
                    if ch == 0x09:
                        offset += (
                            4
                            - (
                                offset
                                + state.bsCount[nextLine]
                                + (1 if adjustTab else 0)
                            )
                            % 4
                        )
                    else:
                        offset += 1
                else:
                    break

                pos += 1

            lastLineEmpty = pos >= max

            oldBSCount.append(state.bsCount[nextLine])
            state.bsCount[nextLine] = (
                state.sCount[nextLine] + 1 + (1 if spaceAfterMarker else 0)
            )

            oldSCount.append(state.sCount[nextLine])
            state.sCount[nextLine] = offset - initial

            oldTShift.append(state.tShift[nextLine])
            state.tShift[nextLine] = pos - state.bMarks[nextLine]

            nextLine += 1
            continue

        # Case 2: line is not inside the blockquote, and the last line was empty.
        if lastLineEmpty:
            break

        # Case 3: another tag found.
        terminate = False
        for terminatorRule in terminatorRules:
            if terminatorRule(state, nextLine, endLine, True):
                terminate = True
                break

        if terminate:
            # Quirk to enforce "hard termination mode" for paragraphs;
            # normally if you call `tokenize(state, startLine, nextLine)`,
            # paragraphs will look below nextLine for paragraph continuation,
            # but if blockquote is terminated by another tag, they shouldn't
            state.lineMax = nextLine

            if state.blkIndent != 0:
                # state.blkIndent was non-zero, we now set it to zero,
                # so we need to re-calculate all offsets to appear as
                # if indent wasn't changed
                oldBMarks.append(state.bMarks[nextLine])
                oldBSCount.append(state.bsCount[nextLine])
                oldTShift.append(state.tShift[nextLine])
                oldSCount.append(state.sCount[nextLine])
                state.sCount[nextLine] -= state.blkIndent

            break

        oldBMarks.append(state.bMarks[nextLine])
        oldBSCount.append(state.bsCount[nextLine])
        oldTShift.append(state.tShift[nextLine])
        oldSCount.append(state.sCount[nextLine])

        # A negative indentation means that this is a paragraph continuation
        #
        state.sCount[nextLine] = -1

        nextLine += 1

    oldIndent = state.blkIndent
    state.blkIndent = 0

    token = state.push("blockquote_open", "blockquote", 1)
    token.markup = ">"
    # ``lines`` aliases token.map so the end line can be patched in below
    token.map = lines = [startLine, 0]

    state.md.block.tokenize(state, startLine, nextLine)

    token = state.push("blockquote_close", "blockquote", -1)
    token.markup = ">"

    state.lineMax = oldLineMax
    state.parentType = oldParentType
    lines[1] = state.line

    # Restore original tShift; this might not be necessary since the parser
    # has already been here, but just to make sure we can do that.
    for i, item in enumerate(oldTShift):
        state.bMarks[i + startLine] = oldBMarks[i]
        state.tShift[i + startLine] = item
        state.sCount[i + startLine] = oldSCount[i]
        state.bsCount[i + startLine] = oldBSCount[i]

    state.blkIndent = oldIndent

    return True

View File

@ -0,0 +1,36 @@
"""Code block (4 spaces padded)."""
import logging
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)


def code(state: StateBlock, startLine: int, endLine: int, silent: bool = False):
    """Indented code block rule: lines padded by at least 4 extra columns."""
    LOGGER.debug("entering code: %s, %s, %s, %s", state, startLine, endLine, silent)

    # The opening line itself must carry at least 4 columns of indent.
    if state.sCount[startLine] - state.blkIndent < 4:
        return False

    # Scan forward: blank lines are tolerated inside the block, and any
    # sufficiently indented line extends it; anything else ends the block.
    scan = startLine + 1
    lastContentLine = scan
    while scan < endLine:
        if state.isEmpty(scan):
            scan += 1
        elif state.sCount[scan] - state.blkIndent >= 4:
            scan += 1
            lastContentLine = scan
        else:
            break

    state.line = lastContentLine
    token = state.push("code_block", "code", 0)
    token.content = (
        state.getLines(startLine, lastContentLine, 4 + state.blkIndent, False) + "\n"
    )
    token.map = [startLine, state.line]
    return True

View File

@ -0,0 +1,104 @@
# fences (``` lang, ~~~ lang)
import logging
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)
def fence(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Fenced code block rule (``` lang / ~~~ lang)."""
    LOGGER.debug("entering fence: %s, %s, %s, %s", state, startLine, endLine, silent)

    haveEndMarker = False
    pos = state.bMarks[startLine] + state.tShift[startLine]
    maximum = state.eMarks[startLine]

    # if it's indented more than 3 spaces, it should be a code block
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False

    # need at least 3 marker characters on the line
    if pos + 3 > maximum:
        return False

    marker = state.srcCharCode[pos]

    # /* ~ */  /* ` */
    if marker != 0x7E and marker != 0x60:
        return False

    # scan marker length
    mem = pos
    pos = state.skipChars(pos, marker)

    length = pos - mem

    if length < 3:
        return False

    markup = state.src[mem:pos]
    params = state.src[pos:maximum]

    # /* ` */ backtick fences may not contain backticks in the info string
    if marker == 0x60:
        if chr(marker) in params:
            return False

    # Since start is found, we can report success here in validation mode
    if silent:
        return True

    # search end of block
    nextLine = startLine

    while True:
        nextLine += 1
        if nextLine >= endLine:
            # unclosed block should be autoclosed by end of document.
            # also block seems to be autoclosed by end of parent
            break

        pos = mem = state.bMarks[nextLine] + state.tShift[nextLine]
        maximum = state.eMarks[nextLine]

        if pos < maximum and state.sCount[nextLine] < state.blkIndent:
            # non-empty line with negative indent should stop the list:
            # - ```
            #  test
            break

        if state.srcCharCode[pos] != marker:
            continue

        if state.sCount[nextLine] - state.blkIndent >= 4:
            # closing fence should be indented less than 4 spaces
            continue

        pos = state.skipChars(pos, marker)

        # closing code fence must be at least as long as the opening one
        if pos - mem < length:
            continue

        # make sure tail has spaces only
        pos = state.skipSpaces(pos)

        if pos < maximum:
            continue

        haveEndMarker = True
        # found!
        break

    # If a fence has heading spaces, they should be removed from its inner block
    length = state.sCount[startLine]

    # without an end marker the content runs to ``nextLine`` exclusive
    state.line = nextLine + (1 if haveEndMarker else 0)

    token = state.push("fence", "code", 0)
    token.info = params
    token.content = state.getLines(startLine + 1, nextLine, length, True)
    token.markup = markup
    token.map = [startLine, state.line]

    return True

View File

@ -0,0 +1,72 @@
""" Atex heading (#, ##, ...) """
from __future__ import annotations
import logging
from ..common.utils import isSpace
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)
def heading(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """ATX heading rule: ``#`` through ``######`` followed by a space."""
    LOGGER.debug("entering heading: %s, %s, %s, %s", state, startLine, endLine, silent)

    cursor = state.bMarks[startLine] + state.tShift[startLine]
    line_end = state.eMarks[startLine]

    # Indented 4+ columns would be an indented code block instead.
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False

    def code_at(idx):
        # Code point at ``idx`` or None once we run past the source end.
        try:
            return state.srcCharCode[idx]
        except IndexError:
            return None

    first = state.srcCharCode[cursor]

    # The line must open with '#' (0x23).
    if first != 0x23 or cursor >= line_end:
        return False

    # Count consecutive '#' characters to determine the heading level.
    level = 1
    cursor += 1
    current = code_at(cursor)
    while current == 0x23 and cursor < line_end and level <= 6:
        level += 1
        cursor += 1
        current = code_at(cursor)

    # More than six markers, or markers not followed by whitespace -> no heading.
    if level > 6 or (cursor < line_end and not isSpace(current)):
        return False

    if silent:
        return True

    # Cut tails like ' ###  ' from the end of the string.
    line_end = state.skipSpacesBack(line_end, cursor)
    closing = state.skipCharsBack(line_end, 0x23, cursor)  # 0x23 == '#'
    if closing > cursor and isSpace(state.srcCharCode[closing - 1]):
        line_end = closing

    state.line = startLine + 1

    token = state.push("heading_open", "h" + str(level), 1)
    token.markup = "########"[:level]
    token.map = [startLine, state.line]

    token = state.push("inline", "", 0)
    token.content = state.src[cursor:line_end].strip()
    token.map = [startLine, state.line]
    token.children = []

    token = state.push("heading_close", "h" + str(level), -1)
    token.markup = "########"[:level]

    return True

View File

@ -0,0 +1,54 @@
"""Horizontal rule
At least 3 of these characters on a line * - _
"""
import logging
from ..common.utils import isSpace
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)
def hr(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Horizontal rule: a line of at least three ``*``, ``-`` or ``_``."""
    LOGGER.debug("entering hr: %s, %s, %s, %s", state, startLine, endLine, silent)

    idx = state.bMarks[startLine] + state.tShift[startLine]
    end = state.eMarks[startLine]

    # Indented 4+ columns would be an indented code block instead.
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False

    marker = state.srcCharCode[idx]
    idx += 1

    # Only '*' (0x2A), '-' (0x2D) and '_' (0x5F) may form a rule.
    if marker not in (0x2A, 0x2D, 0x5F):
        return False

    # Markers can be mixed with spaces, but there should be at least 3 of
    # them; any other character disqualifies the line.
    count = 1
    while idx < end:
        current = state.srcCharCode[idx]
        idx += 1
        if current == marker:
            count += 1
        elif not isSpace(current):
            return False

    if count < 3:
        return False
    if silent:
        return True

    state.line = startLine + 1
    token = state.push("hr", "hr", 0)
    token.map = [startLine, state.line]
    token.markup = chr(marker) * (count + 1)
    return True

View File

@ -0,0 +1,91 @@
# HTML block
from __future__ import annotations
import logging
import re
from ..common.html_blocks import block_names
from ..common.html_re import HTML_OPEN_CLOSE_TAG_STR
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)
# An array of opening and corresponding closing sequences for html tags,
# last argument defines whether it can terminate a paragraph or not
HTML_SEQUENCES: list[tuple[re.Pattern, re.Pattern, bool]] = [
    # literal content tags: run until the matching closing tag
    (
        re.compile(r"^<(script|pre|style|textarea)(?=(\s|>|$))", re.IGNORECASE),
        re.compile(r"<\/(script|pre|style|textarea)>", re.IGNORECASE),
        True,
    ),
    # HTML comments
    (re.compile(r"^<!--"), re.compile(r"-->"), True),
    # processing instructions
    (re.compile(r"^<\?"), re.compile(r"\?>"), True),
    # declarations, e.g. <!DOCTYPE ...>
    (re.compile(r"^<![A-Z]"), re.compile(r">"), True),
    # CDATA sections
    (re.compile(r"^<!\[CDATA\["), re.compile(r"\]\]>"), True),
    # known block-level tag names: block runs until a blank line
    (
        re.compile("^</?(" + "|".join(block_names) + ")(?=(\\s|/?>|$))", re.IGNORECASE),
        re.compile(r"^$"),
        True,
    ),
    # any other complete tag alone on a line (cannot interrupt a paragraph)
    (re.compile(HTML_OPEN_CLOSE_TAG_STR + "\\s*$"), re.compile(r"^$"), False),
]
def html_block(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Raw HTML block rule: match a line against ``HTML_SEQUENCES``."""
    LOGGER.debug(
        "entering html_block: %s, %s, %s, %s", state, startLine, endLine, silent
    )

    begin = state.bMarks[startLine] + state.tShift[startLine]
    finish = state.eMarks[startLine]

    # Indented 4+ columns would be an indented code block instead.
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False

    # Raw HTML must be enabled in the parser options.
    if not state.md.options.get("html", None):
        return False

    # The line has to open with '<' (0x3C).
    if state.srcCharCode[begin] != 0x3C:
        return False

    lineText = state.src[begin:finish]

    # Find the first sequence whose opening pattern matches this line.
    for candidate in HTML_SEQUENCES:
        if candidate[0].search(lineText):
            html_seq = candidate
            break
    else:
        html_seq = None

    if not html_seq:
        return False

    if silent:
        # true if this sequence can be a terminator, false otherwise
        return html_seq[2]

    nextLine = startLine + 1

    # HTML block detected: roll down until the closing pattern matches (or
    # a dedent / end of input), unless the opening line already closes it.
    if not html_seq[1].search(lineText):
        while nextLine < endLine:
            if state.sCount[nextLine] < state.blkIndent:
                break

            begin = state.bMarks[nextLine] + state.tShift[nextLine]
            finish = state.eMarks[nextLine]
            lineText = state.src[begin:finish]

            if html_seq[1].search(lineText):
                if len(lineText) != 0:
                    nextLine += 1
                break
            nextLine += 1

    state.line = nextLine

    token = state.push("html_block", "", 0)
    token.map = [startLine, nextLine]
    token.content = state.getLines(startLine, nextLine, state.blkIndent, True)

    return True

View File

@ -0,0 +1,90 @@
# lheading (---, ==)
import logging
from ..ruler import Ruler
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)
def lheading(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Setext heading rule: text underlined by ``===`` (h1) or ``---`` (h2)."""
    LOGGER.debug("entering lheading: %s, %s, %s, %s", state, startLine, endLine, silent)

    level = None
    nextLine = startLine + 1
    ruler: Ruler = state.md.block.ruler
    terminatorRules = ruler.getRules("paragraph")

    # if it's indented more than 3 spaces, it should be a code block
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False

    oldParentType = state.parentType
    state.parentType = "paragraph"  # use paragraph to match terminatorRules

    # jump line-by-line until empty one or EOF
    while nextLine < endLine and not state.isEmpty(nextLine):
        # this would be a code block normally, but after paragraph
        # it's considered a lazy continuation regardless of what's there
        if state.sCount[nextLine] - state.blkIndent > 3:
            nextLine += 1
            continue

        # Check for underline in setext header
        if state.sCount[nextLine] >= state.blkIndent:
            pos = state.bMarks[nextLine] + state.tShift[nextLine]
            maximum = state.eMarks[nextLine]

            if pos < maximum:
                marker = state.srcCharCode[pos]

                # /* - */  /* = */
                if marker == 0x2D or marker == 0x3D:
                    pos = state.skipChars(pos, marker)
                    pos = state.skipSpaces(pos)

                    # only markers and trailing spaces on the line: underline found.
                    # NOTE: ``marker`` is only bound on this path, which is
                    # also the only path that sets ``level`` — so its use
                    # after the loop is safe.
                    if pos >= maximum:
                        level = 1 if marker == 0x3D else 2
                        break

        # quirk for blockquotes, this line should already be checked by that rule
        if state.sCount[nextLine] < 0:
            nextLine += 1
            continue

        # Some tags can terminate paragraph without empty line.
        terminate = False
        for terminatorRule in terminatorRules:
            if terminatorRule(state, nextLine, endLine, True):
                terminate = True
                break
        if terminate:
            break

        nextLine += 1

    if not level:
        # Didn't find valid underline
        return False

    content = state.getLines(startLine, nextLine, state.blkIndent, False).strip()

    state.line = nextLine + 1

    token = state.push("heading_open", "h" + str(level), 1)
    token.markup = chr(marker)
    token.map = [startLine, state.line]

    token = state.push("inline", "", 0)
    token.content = content
    token.map = [startLine, state.line - 1]
    token.children = []

    token = state.push("heading_close", "h" + str(level), -1)
    token.markup = chr(marker)

    state.parentType = oldParentType

    return True

View File

@ -0,0 +1,344 @@
# Lists
import logging
from ..common.utils import isSpace
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)
# Search `[-+*][\n ]`, returns next pos after marker on success
# or -1 on fail.
def skipBulletListMarker(state: StateBlock, startLine: int):
    """Return the position just past a bullet marker (``*``/``-``/``+``), or -1."""
    cursor = state.bMarks[startLine] + state.tShift[startLine]
    line_end = state.eMarks[startLine]

    marker = state.srcCharCode[cursor]
    cursor += 1

    # 0x2A '*', 0x2D '-', 0x2B '+'
    if marker not in (0x2A, 0x2D, 0x2B):
        return -1

    if cursor < line_end and not isSpace(state.srcCharCode[cursor]):
        # " -test " - is not a list item
        return -1

    return cursor
# Search `\d+[.)][\n ]`, returns next pos after marker on success
# or -1 on fail.
def skipOrderedListMarker(state: StateBlock, startLine: int):
    """Return the position just past an ordered marker (``1.`` / ``1)``), or -1."""
    start = state.bMarks[startLine] + state.tShift[startLine]
    cursor = start
    line_end = state.eMarks[startLine]

    # Need at least two characters: one digit plus '.' or ')'.
    if cursor + 1 >= line_end:
        return -1

    first = state.srcCharCode[cursor]
    cursor += 1

    # Must open with an ASCII digit ('0' .. '9').
    if first < 0x30 or first > 0x39:
        return -1

    while True:
        # Running off the end of the line means no terminator was found.
        if cursor >= line_end:
            return -1

        code = state.srcCharCode[cursor]
        cursor += 1

        if 0x30 <= code <= 0x39:
            # Cap the marker at 9 digits (prevents integer overflow in
            # downstream consumers, mirroring CommonMark).
            if cursor - start >= 10:
                return -1
            continue

        # ')' (0x29) or '.' (0x2E) terminates a valid marker.
        if code == 0x29 or code == 0x2E:
            break

        return -1

    if cursor < line_end and not isSpace(state.srcCharCode[cursor]):
        # " 1.test " - is not a list item
        return -1

    return cursor
def markTightParagraphs(state: StateBlock, idx: int):
    """Hide paragraph_open/close token pairs directly inside the list at ``idx``."""
    target_level = state.level + 2

    i = idx + 2
    stop = len(state.tokens) - 2
    while i < stop:
        tok = state.tokens[i]
        if tok.level == target_level and tok.type == "paragraph_open":
            # Hide both the opener and its matching closer (two tokens later).
            state.tokens[i + 2].hidden = True
            tok.hidden = True
            i += 2
        i += 1
def list_block(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Ordered / bullet list rule.

    Detects the marker, then repeatedly re-points the per-line offset
    arrays at each item's content and re-enters the block tokenizer,
    restoring the saved state after every item.
    """
    LOGGER.debug("entering list: %s, %s, %s, %s", state, startLine, endLine, silent)

    isTerminatingParagraph = False
    tight = True

    # if it's indented more than 3 spaces, it should be a code block
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False

    # Special case:
    #  - item 1
    #   - item 2
    #    - item 3
    #     - item 4
    #      - this one is a paragraph continuation
    if (
        state.listIndent >= 0
        and state.sCount[startLine] - state.listIndent >= 4
        and state.sCount[startLine] < state.blkIndent
    ):
        return False

    # limit conditions when list can interrupt
    # a paragraph (validation mode only)
    if silent and state.parentType == "paragraph":
        # Next list item should still terminate previous list item
        #
        # This code can fail if plugins use blkIndent as well as lists,
        # but I hope the spec gets fixed long before that happens.
        #
        if state.tShift[startLine] >= state.blkIndent:
            isTerminatingParagraph = True

    # Detect list type and position after marker
    posAfterMarker = skipOrderedListMarker(state, startLine)
    if posAfterMarker >= 0:
        isOrdered = True
        start = state.bMarks[startLine] + state.tShift[startLine]
        # numeric value of the marker (digits before '.' or ')')
        markerValue = int(state.src[start : posAfterMarker - 1])

        # If we're starting a new ordered list right after
        # a paragraph, it should start with 1.
        if isTerminatingParagraph and markerValue != 1:
            return False
    else:
        posAfterMarker = skipBulletListMarker(state, startLine)
        if posAfterMarker >= 0:
            isOrdered = False
        else:
            return False

    # If we're starting a new unordered list right after
    # a paragraph, first line should not be empty.
    if isTerminatingParagraph:
        if state.skipSpaces(posAfterMarker) >= state.eMarks[startLine]:
            return False

    # We should terminate list on style change. Remember first one to compare.
    markerCharCode = state.srcCharCode[posAfterMarker - 1]

    # For validation mode we can terminate immediately
    if silent:
        return True

    # Start list
    listTokIdx = len(state.tokens)

    if isOrdered:
        token = state.push("ordered_list_open", "ol", 1)
        # only emit a "start" attribute when it differs from the default 1
        if markerValue != 1:
            token.attrs = {"start": markerValue}
    else:
        token = state.push("bullet_list_open", "ul", 1)

    # ``listLines`` aliases token.map so the end line can be patched below
    token.map = listLines = [startLine, 0]
    token.markup = chr(markerCharCode)

    #
    # Iterate list items
    #

    nextLine = startLine
    prevEmptyEnd = False
    terminatorRules = state.md.block.ruler.getRules("list")

    oldParentType = state.parentType
    state.parentType = "list"

    while nextLine < endLine:
        pos = posAfterMarker
        maximum = state.eMarks[nextLine]

        initial = offset = (
            state.sCount[nextLine]
            + posAfterMarker
            - (state.bMarks[startLine] + state.tShift[startLine])
        )

        # measure whitespace after the marker, expanding tabs
        while pos < maximum:
            ch = state.srcCharCode[pos]

            if ch == 0x09:  # \t
                offset += 4 - (offset + state.bsCount[nextLine]) % 4
            elif ch == 0x20:  # \s
                offset += 1
            else:
                break

            pos += 1

        contentStart = pos

        if contentStart >= maximum:
            # trimming space in "-    \n  3" case, indent is 1 here
            indentAfterMarker = 1
        else:
            indentAfterMarker = offset - initial

        # If we have more than 4 spaces, the indent is 1
        # (the rest is just indented code block)
        if indentAfterMarker > 4:
            indentAfterMarker = 1

        # "  -  test"
        #  ^^^^^ - calculating total length of this thing
        indent = initial + indentAfterMarker

        # Run subparser & write tokens
        token = state.push("list_item_open", "li", 1)
        token.markup = chr(markerCharCode)
        token.map = itemLines = [startLine, 0]
        if isOrdered:
            token.info = state.src[start : posAfterMarker - 1]

        # change current state, then restore it after parser subcall
        oldTight = state.tight
        oldTShift = state.tShift[startLine]
        oldSCount = state.sCount[startLine]

        #  - example list
        # ^ listIndent position will be here
        #   ^ blkIndent position will be here
        #
        oldListIndent = state.listIndent
        state.listIndent = state.blkIndent
        state.blkIndent = indent

        state.tight = True
        state.tShift[startLine] = contentStart - state.bMarks[startLine]
        state.sCount[startLine] = offset

        if contentStart >= maximum and state.isEmpty(startLine + 1):
            # workaround for this case
            # (list item is empty, list terminates before "foo"):
            # ~~~~~~~~
            #   -
            #
            #     foo
            # ~~~~~~~~
            state.line = min(state.line + 2, endLine)
        else:
            # NOTE in list.js this was:
            # state.md.block.tokenize(state, startLine, endLine, True)
            # but tokeniz does not take the final parameter
            state.md.block.tokenize(state, startLine, endLine)

        # If any of list item is tight, mark list as tight
        if (not state.tight) or prevEmptyEnd:
            tight = False

        # Item become loose if finish with empty line,
        # but we should filter last element, because it means list finish
        prevEmptyEnd = (state.line - startLine) > 1 and state.isEmpty(state.line - 1)

        # restore the saved parser state for this line
        state.blkIndent = state.listIndent
        state.listIndent = oldListIndent
        state.tShift[startLine] = oldTShift
        state.sCount[startLine] = oldSCount
        state.tight = oldTight

        token = state.push("list_item_close", "li", -1)
        token.markup = chr(markerCharCode)

        nextLine = startLine = state.line
        itemLines[1] = nextLine

        if nextLine >= endLine:
            break

        contentStart = state.bMarks[startLine]

        #
        # Try to check if list is terminated or continued.
        #
        if state.sCount[nextLine] < state.blkIndent:
            break

        # if it's indented more than 3 spaces, it should be a code block
        if state.sCount[startLine] - state.blkIndent >= 4:
            break

        # fail if terminating block found
        terminate = False
        for terminatorRule in terminatorRules:
            if terminatorRule(state, nextLine, endLine, True):
                terminate = True
                break

        if terminate:
            break

        # fail if list has another type
        if isOrdered:
            posAfterMarker = skipOrderedListMarker(state, nextLine)
            if posAfterMarker < 0:
                break
            start = state.bMarks[nextLine] + state.tShift[nextLine]
        else:
            posAfterMarker = skipBulletListMarker(state, nextLine)
            if posAfterMarker < 0:
                break

        # marker style change ('-' vs '*' vs '+', '.' vs ')') ends the list
        if markerCharCode != state.srcCharCode[posAfterMarker - 1]:
            break

    # Finalize list
    if isOrdered:
        token = state.push("ordered_list_close", "ol", -1)
    else:
        token = state.push("bullet_list_close", "ul", -1)
    token.markup = chr(markerCharCode)

    listLines[1] = nextLine
    state.line = nextLine

    state.parentType = oldParentType

    # mark paragraphs tight if needed
    if tight:
        markTightParagraphs(state, listTokIdx)

    return True

View File

@ -0,0 +1,67 @@
"""Paragraph."""
import logging
from ..ruler import Ruler
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)


def paragraph(state: StateBlock, startLine: int, endLine: int, silent: bool = False):
    """Paragraph rule: consume lines until a blank line or a terminator rule."""
    LOGGER.debug(
        "entering paragraph: %s, %s, %s, %s", state, startLine, endLine, silent
    )

    scan = startLine + 1
    block_ruler: Ruler = state.md.block.ruler
    terminators = block_ruler.getRules("paragraph")
    endLine = state.lineMax

    previous_parent = state.parentType
    state.parentType = "paragraph"

    # Walk forward line by line until a blank line, EOF, or a terminator.
    while scan < endLine:
        if state.isEmpty(scan):
            break

        # More than 3 columns of extra indent would normally be a code
        # block, but after a paragraph it is a lazy continuation.
        if state.sCount[scan] - state.blkIndent > 3:
            scan += 1
            continue

        # Negative indent: quirk for blockquotes, already checked there.
        if state.sCount[scan] < 0:
            scan += 1
            continue

        # Some tags can terminate a paragraph without an empty line.
        stopped = False
        for rule in terminators:
            if rule(state, scan, endLine, True):
                stopped = True
                break
        if stopped:
            break

        scan += 1

    content = state.getLines(startLine, scan, state.blkIndent, False).strip()

    state.line = scan

    opener = state.push("paragraph_open", "p", 1)
    opener.map = [startLine, state.line]

    inline = state.push("inline", "", 0)
    inline.content = content
    inline.map = [startLine, state.line]
    inline.children = []

    state.push("paragraph_close", "p", -1)

    state.parentType = previous_parent

    return True

View File

@ -0,0 +1,218 @@
import logging
from ..common.utils import charCodeAt, isSpace, normalizeReference
from .state_block import StateBlock
LOGGER = logging.getLogger(__name__)
def reference(state: StateBlock, startLine, _endLine, silent):
    """Parse a link reference definition: ``[label]: destination 'title'``.

    On success the definition is stored in ``state.env["references"]``
    (first definition for a label wins; later duplicates are appended to
    ``state.env["duplicate_refs"]``) and ``True`` is returned.  ``False``
    means "not a reference — let other rules try".
    """
    LOGGER.debug(
        "entering reference: %s, %s, %s, %s", state, startLine, _endLine, silent
    )
    lines = 0
    pos = state.bMarks[startLine] + state.tShift[startLine]
    maximum = state.eMarks[startLine]
    nextLine = startLine + 1
    # if it's indented more than 3 spaces, it should be a code block
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False
    if state.srcCharCode[pos] != 0x5B: # /* [ */
        return False
    # Simple check to quickly interrupt scan on [link](url) at the start of line.
    # Can be useful on practice: https:#github.com/markdown-it/markdown-it/issues/54
    while pos < maximum:
        # /* ] */ /* \ */ /* : */
        if state.srcCharCode[pos] == 0x5D and state.srcCharCode[pos - 1] != 0x5C:
            if pos + 1 == maximum:
                return False
            if state.srcCharCode[pos + 1] != 0x3A:
                return False
            break
        pos += 1
    endLine = state.lineMax
    # jump line-by-line until empty one or EOF
    terminatorRules = state.md.block.ruler.getRules("reference")
    oldParentType = state.parentType
    state.parentType = "reference"
    while nextLine < endLine and not state.isEmpty(nextLine):
        # this would be a code block normally, but after paragraph
        # it's considered a lazy continuation regardless of what's there
        if state.sCount[nextLine] - state.blkIndent > 3:
            nextLine += 1
            continue
        # quirk for blockquotes, this line should already be checked by that rule
        if state.sCount[nextLine] < 0:
            nextLine += 1
            continue
        # Some tags can terminate paragraph without empty line.
        terminate = False
        for terminatorRule in terminatorRules:
            if terminatorRule(state, nextLine, endLine, True):
                terminate = True
                break
        if terminate:
            break
        nextLine += 1
    string = state.getLines(startLine, nextLine, state.blkIndent, False).strip()
    maximum = len(string)
    labelEnd = None
    pos = 1
    # Scan the label: find the closing "]" while counting embedded newlines
    # and honouring backslash escapes.
    while pos < maximum:
        ch = charCodeAt(string, pos)
        if ch == 0x5B: # /* [ */
            return False
        elif ch == 0x5D: # /* ] */
            labelEnd = pos
            break
        elif ch == 0x0A: # /* \n */
            lines += 1
        elif ch == 0x5C: # /* \ */
            # skip the escaped character; an escaped newline still counts
            pos += 1
            if pos < maximum and charCodeAt(string, pos) == 0x0A:
                lines += 1
        pos += 1
    if (
        labelEnd is None or labelEnd < 0 or charCodeAt(string, labelEnd + 1) != 0x3A
    ): # /* : */
        return False
    # [label]: destination 'title'
    # ^^^ skip optional whitespace here
    pos = labelEnd + 2
    while pos < maximum:
        ch = charCodeAt(string, pos)
        if ch == 0x0A:
            lines += 1
        elif isSpace(ch):
            pass
        else:
            break
        pos += 1
    # [label]: destination 'title'
    # ^^^^^^^^^^^ parse this
    res = state.md.helpers.parseLinkDestination(string, pos, maximum)
    if not res.ok:
        return False
    href = state.md.normalizeLink(res.str)
    if not state.md.validateLink(href):
        return False
    pos = res.pos
    lines += res.lines
    # save cursor state, we could require to rollback later
    destEndPos = pos
    destEndLineNo = lines
    # [label]: destination 'title'
    # ^^^ skipping those spaces
    start = pos
    while pos < maximum:
        ch = charCodeAt(string, pos)
        if ch == 0x0A:
            lines += 1
        elif isSpace(ch):
            pass
        else:
            break
        pos += 1
    # [label]: destination 'title'
    # ^^^^^^^ parse this
    res = state.md.helpers.parseLinkTitle(string, pos, maximum)
    # A title only counts if it is separated from the destination by at
    # least one space (start != pos); otherwise roll back to the saved
    # destination-end cursor.
    if pos < maximum and start != pos and res.ok:
        title = res.str
        pos = res.pos
        lines += res.lines
    else:
        title = ""
        pos = destEndPos
        lines = destEndLineNo
    # skip trailing spaces until the rest of the line
    while pos < maximum:
        ch = charCodeAt(string, pos)
        if not isSpace(ch):
            break
        pos += 1
    if pos < maximum and charCodeAt(string, pos) != 0x0A:
        if title:
            # garbage at the end of the line after title,
            # but it could still be a valid reference if we roll back
            title = ""
            pos = destEndPos
            lines = destEndLineNo
            while pos < maximum:
                ch = charCodeAt(string, pos)
                if not isSpace(ch):
                    break
                pos += 1
    if pos < maximum and charCodeAt(string, pos) != 0x0A:
        # garbage at the end of the line
        return False
    label = normalizeReference(string[1:labelEnd])
    if not label:
        # CommonMark 0.20 disallows empty labels
        return False
    # Reference can not terminate anything. This check is for safety only.
    if silent:
        return True
    if "references" not in state.env:
        state.env["references"] = {}
    state.line = startLine + lines + 1
    # note, this is not part of markdown-it JS, but is useful for renderers
    if state.md.options.get("inline_definitions", False):
        token = state.push("definition", "", 0)
        token.meta = {
            "id": label,
            "title": title,
            "url": href,
            "label": string[1:labelEnd],
        }
        token.map = [startLine, state.line]
    if label not in state.env["references"]:
        state.env["references"][label] = {
            "title": title,
            "href": href,
            "map": [startLine, state.line],
        }
    else:
        state.env.setdefault("duplicate_refs", []).append(
            {
                "title": title,
                "href": href,
                "label": label,
                "map": [startLine, state.line],
            }
        )
    state.parentType = oldParentType
    return True

View File

@ -0,0 +1,230 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from ..common.utils import isSpace
from ..ruler import StateBase
from ..token import Token
if TYPE_CHECKING:
from markdown_it.main import MarkdownIt
class StateBlock(StateBase):
    """Per-parse state for the block-level tokenizer.

    Holds the token stream being built plus per-line caches (begin/end
    offsets, first-non-space offsets, expanded indent widths) computed in a
    single pass over the source so block rules can address input
    line-by-line.
    """
    def __init__(
        self,
        src: str,
        md: MarkdownIt,
        env,
        tokens: list[Token],
        srcCharCode: tuple[int, ...] | None = None,
    ):
        # Reuse a caller-supplied character-code tuple when available;
        # otherwise assigning ``self.src`` is expected to derive
        # ``srcCharCode`` (presumably via a StateBase property — confirm
        # against ruler.py).
        if srcCharCode is not None:
            self._src = src
            self.srcCharCode = srcCharCode
        else:
            self.src = src
        # link to parser instance
        self.md = md
        self.env = env
        #
        # Internal state variables
        #
        self.tokens = tokens
        self.bMarks = [] # line begin offsets for fast jumps
        self.eMarks = [] # line end offsets for fast jumps
        # offsets of the first non-space characters (tabs not expanded)
        self.tShift = []
        self.sCount = [] # indents for each line (tabs expanded)
        # An amount of virtual spaces (tabs expanded) between beginning
        # of each line (bMarks) and real beginning of that line.
        #
        # It exists only as a hack because blockquotes override bMarks
        # losing information in the process.
        #
        # It's used only when expanding tabs, you can think about it as
        # an initial tab length, e.g. bsCount=21 applied to string `\t123`
        # means first tab should be expanded to 4-21%4 === 3 spaces.
        #
        self.bsCount = []
        # block parser variables
        self.blkIndent = 0 # required block content indent (for example, if we are
        # inside a list, it would be positioned after list marker)
        self.line = 0 # line index in src
        self.lineMax = 0 # lines count
        self.tight = False # loose/tight mode for lists
        self.ddIndent = -1 # indent of the current dd block (-1 if there isn't any)
        self.listIndent = -1 # indent of the current list block (-1 if there isn't any)
        # can be 'blockquote', 'list', 'root', 'paragraph' or 'reference'
        # used in lists to determine if they interrupt a paragraph
        self.parentType = "root"
        self.level = 0
        # renderer
        self.result = ""
        # Create caches
        # Generate markers.
        # Single pass over the source: for every line, record its begin/end
        # offsets, leading-whitespace count, and expanded indent width
        # (a tab expands to the next multiple of 4 columns).
        indent_found = False
        start = pos = indent = offset = 0
        length = len(self.src)
        for pos, character in enumerate(self.srcCharCode):
            if not indent_found:
                if isSpace(character):
                    indent += 1
                    if character == 0x09:
                        offset += 4 - offset % 4
                    else:
                        offset += 1
                    continue
                else:
                    indent_found = True
            if character == 0x0A or pos == length - 1:
                if character != 0x0A:
                    pos += 1
                self.bMarks.append(start)
                self.eMarks.append(pos)
                self.tShift.append(indent)
                self.sCount.append(offset)
                self.bsCount.append(0)
                indent_found = False
                indent = 0
                offset = 0
                start = pos + 1
        # Push fake entry to simplify cache bounds checks
        self.bMarks.append(length)
        self.eMarks.append(length)
        self.tShift.append(0)
        self.sCount.append(0)
        self.bsCount.append(0)
        self.lineMax = len(self.bMarks) - 1 # don't count last fake line
    def __repr__(self):
        return (
            f"{self.__class__.__name__}"
            f"(line={self.line},level={self.level},tokens={len(self.tokens)})"
        )
    def push(self, ttype: str, tag: str, nesting: int) -> Token:
        """Push new token to "stream"."""
        token = Token(ttype, tag, nesting)
        token.block = True
        if nesting < 0:
            self.level -= 1 # closing tag
        token.level = self.level
        if nesting > 0:
            self.level += 1 # opening tag
        self.tokens.append(token)
        return token
    def isEmpty(self, line: int) -> bool:
        """Return True if ``line`` is blank (only whitespace before its end)."""
        return (self.bMarks[line] + self.tShift[line]) >= self.eMarks[line]
    def skipEmptyLines(self, from_pos: int) -> int:
        """Return the index of the first non-blank line at or after ``from_pos``."""
        while from_pos < self.lineMax:
            try:
                if (self.bMarks[from_pos] + self.tShift[from_pos]) < self.eMarks[
                    from_pos
                ]:
                    break
            except IndexError:
                pass
            from_pos += 1
        return from_pos
    def skipSpaces(self, pos: int) -> int:
        """Skip spaces from given position."""
        while pos < len(self.src):
            if not isSpace(self.srcCharCode[pos]):
                break
            pos += 1
        return pos
    def skipSpacesBack(self, pos: int, minimum: int) -> int:
        """Skip spaces from given position in reverse."""
        if pos <= minimum:
            return pos
        while pos > minimum:
            pos -= 1
            if not isSpace(self.srcCharCode[pos]):
                return pos + 1
        return pos
    def skipChars(self, pos: int, code: int) -> int:
        """Skip char codes from given position."""
        while pos < len(self.src):
            if self.srcCharCode[pos] != code:
                break
            pos += 1
        return pos
    def skipCharsBack(self, pos: int, code: int, minimum: int) -> int:
        """Skip char codes reverse from given position - 1."""
        if pos <= minimum:
            return pos
        while pos > minimum:
            pos -= 1
            if code != self.srcCharCode[pos]:
                return pos + 1
        return pos
    def getLines(self, begin: int, end: int, indent: int, keepLastLF: bool) -> str:
        """Cut lines range from source.

        Returns lines ``begin``..``end`` (exclusive) with up to ``indent``
        columns of leading indentation removed from each line; a tab that
        straddles the cut point is partially re-expanded into spaces.
        """
        line = begin
        if begin >= end:
            return ""
        queue = [""] * (end - begin)
        i = 1
        while line < end:
            lineIndent = 0
            lineStart = first = self.bMarks[line]
            if line + 1 < end or keepLastLF:
                last = self.eMarks[line] + 1
            else:
                last = self.eMarks[line]
            while (first < last) and (lineIndent < indent):
                ch = self.srcCharCode[first]
                if isSpace(ch):
                    if ch == 0x09:
                        lineIndent += 4 - (lineIndent + self.bsCount[line]) % 4
                    else:
                        lineIndent += 1
                elif first - lineStart < self.tShift[line]:
                    lineIndent += 1
                else:
                    break
                first += 1
            if lineIndent > indent:
                # partially expanding tabs in code blocks, e.g '\t\tfoobar'
                # with indent=2 becomes ' \tfoobar'
                queue[i - 1] = (" " * (lineIndent - indent)) + self.src[first:last]
            else:
                queue[i - 1] = self.src[first:last]
            line += 1
            i += 1
        return "".join(queue)

View File

@ -0,0 +1,238 @@
# GFM table, https://github.github.com/gfm/#tables-extension-
import re
from ..common.utils import charCodeAt, isSpace
from .state_block import StateBlock
headerLineRe = re.compile(r"^:?-+:?$")
enclosingPipesRe = re.compile(r"^\||\|$")
def getLine(state: StateBlock, line: int):
    """Return the text of ``line`` with its leading whitespace stripped."""
    begin = state.bMarks[line] + state.tShift[line]
    end = state.eMarks[line]
    return state.src[begin:end]
def escapedSplit(string):
    """Split ``string`` on unescaped ``|`` pipes into table cells.

    A pipe preceded by a backslash (``\\|``) is kept as a literal pipe in
    the cell text; the escaping backslash itself is dropped.
    """
    cells = []
    cell = ""
    segment_start = 0
    prev_was_backslash = False
    index = 0
    limit = len(string)
    code = charCodeAt(string, index)
    while index < limit:
        if code == 0x7C: # "|"
            if prev_was_backslash:
                # escaped pipe '\|': drop the backslash, keep scanning the cell
                cell += string[segment_start : index - 1]
                segment_start = index
            else:
                # unescaped pipe: close the current cell
                cells.append(cell + string[segment_start:index])
                cell = ""
                segment_start = index + 1
        prev_was_backslash = code == 0x5C # "\"
        index += 1
        code = charCodeAt(string, index)
    cells.append(cell + string[segment_start:])
    return cells
def table(state: StateBlock, startLine: int, endLine: int, silent: bool):
    """Parse a GFM table: header row, delimiter row, then zero or more body rows.

    Returns ``False`` when the lines do not form a table; otherwise pushes
    the full table/thead/tbody token structure and returns ``True``.
    """
    tbodyLines = None
    # should have at least two lines
    if startLine + 2 > endLine:
        return False
    nextLine = startLine + 1
    if state.sCount[nextLine] < state.blkIndent:
        return False
    # if it's indented more than 3 spaces, it should be a code block
    if state.sCount[nextLine] - state.blkIndent >= 4:
        return False
    # first character of the second line should be '|', '-', ':',
    # and no other characters are allowed but spaces;
    # basically, this is the equivalent of /^[-:|][-:|\s]*$/ regexp
    pos = state.bMarks[nextLine] + state.tShift[nextLine]
    if pos >= state.eMarks[nextLine]:
        return False
    first_ch = state.srcCharCode[pos]
    pos += 1
    if first_ch not in {0x7C, 0x2D, 0x3A}: # not in {"|", "-", ":"}
        return False
    if pos >= state.eMarks[nextLine]:
        return False
    second_ch = state.srcCharCode[pos]
    pos += 1
    # not in {"|", "-", ":"} and not space
    if second_ch not in {0x7C, 0x2D, 0x3A} and not isSpace(second_ch):
        return False
    # if first character is '-', then second character must not be a space
    # (due to parsing ambiguity with list)
    if first_ch == 0x2D and isSpace(second_ch):
        return False
    while pos < state.eMarks[nextLine]:
        ch = state.srcCharCode[pos]
        # /* | */ /* - */ /* : */
        if ch not in {0x7C, 0x2D, 0x3A} and not isSpace(ch):
            return False
        pos += 1
    # Parse the delimiter row into per-column alignments.
    lineText = getLine(state, startLine + 1)
    columns = lineText.split("|")
    aligns = []
    for i in range(len(columns)):
        t = columns[i].strip()
        if not t:
            # allow empty columns before and after table, but not in between columns;
            # e.g. allow ` |---| `, disallow ` ---||--- `
            if i == 0 or i == len(columns) - 1:
                continue
            else:
                return False
        if not headerLineRe.search(t):
            return False
        if charCodeAt(t, len(t) - 1) == 0x3A: # /* : */
            # trailing ":" -> right-aligned, or centered if it also leads
            # /* : */
            aligns.append("center" if charCodeAt(t, 0) == 0x3A else "right")
        elif charCodeAt(t, 0) == 0x3A: # /* : */
            aligns.append("left")
        else:
            aligns.append("")
    lineText = getLine(state, startLine).strip()
    if "|" not in lineText:
        return False
    if state.sCount[startLine] - state.blkIndent >= 4:
        return False
    columns = escapedSplit(lineText)
    # Drop the empty leading/trailing cells produced by enclosing pipes.
    if columns and columns[0] == "":
        columns.pop(0)
    if columns and columns[-1] == "":
        columns.pop()
    # header row will define an amount of columns in the entire table,
    # and align row should be exactly the same (the rest of the rows can differ)
    columnCount = len(columns)
    if columnCount == 0 or columnCount != len(aligns):
        return False
    if silent:
        return True
    oldParentType = state.parentType
    state.parentType = "table"
    # use 'blockquote' lists for termination because it's
    # the most similar to tables
    terminatorRules = state.md.block.ruler.getRules("blockquote")
    token = state.push("table_open", "table", 1)
    token.map = tableLines = [startLine, 0]
    token = state.push("thead_open", "thead", 1)
    token.map = [startLine, startLine + 1]
    token = state.push("tr_open", "tr", 1)
    token.map = [startLine, startLine + 1]
    for i in range(len(columns)):
        token = state.push("th_open", "th", 1)
        if aligns[i]:
            token.attrs = {"style": "text-align:" + aligns[i]}
        token = state.push("inline", "", 0)
        # note in markdown-it this map was removed in v12.0.0 however, we keep it,
        # since it is helpful to propagate to children tokens
        token.map = [startLine, startLine + 1]
        token.content = columns[i].strip()
        token.children = []
        token = state.push("th_close", "th", -1)
    token = state.push("tr_close", "tr", -1)
    token = state.push("thead_close", "thead", -1)
    nextLine = startLine + 2
    while nextLine < endLine:
        if state.sCount[nextLine] < state.blkIndent:
            break
        terminate = False
        for i in range(len(terminatorRules)):
            if terminatorRules[i](state, nextLine, endLine, True):
                terminate = True
                break
        if terminate:
            break
        lineText = getLine(state, nextLine).strip()
        if not lineText:
            break
        if state.sCount[nextLine] - state.blkIndent >= 4:
            break
        columns = escapedSplit(lineText)
        if columns and columns[0] == "":
            columns.pop(0)
        if columns and columns[-1] == "":
            columns.pop()
        if nextLine == startLine + 2:
            # first body row: open tbody lazily so header-only tables get none
            token = state.push("tbody_open", "tbody", 1)
            token.map = tbodyLines = [startLine + 2, 0]
        token = state.push("tr_open", "tr", 1)
        token.map = [nextLine, nextLine + 1]
        for i in range(columnCount):
            token = state.push("td_open", "td", 1)
            if aligns[i]:
                token.attrs = {"style": "text-align:" + aligns[i]}
            token = state.push("inline", "", 0)
            # note in markdown-it this map was removed in v12.0.0 however, we keep it,
            # since it is helpful to propagate to children tokens
            token.map = [nextLine, nextLine + 1]
            try:
                token.content = columns[i].strip() if columns[i] else ""
            except IndexError:
                # body row has fewer cells than the header: pad with empties
                token.content = ""
            token.children = []
            token = state.push("td_close", "td", -1)
        token = state.push("tr_close", "tr", -1)
        nextLine += 1
    if tbodyLines:
        token = state.push("tbody_close", "tbody", -1)
        tbodyLines[1] = nextLine
    token = state.push("table_close", "table", -1)
    tableLines[1] = nextLine
    state.parentType = oldParentType
    state.line = nextLine
    return True

View File

@ -0,0 +1,17 @@
__all__ = (
"StateCore",
"normalize",
"block",
"inline",
"replace",
"smartquotes",
"linkify",
)
from .block import block
from .inline import inline
from .linkify import linkify
from .normalize import normalize
from .replacements import replace
from .smartquotes import smartquotes
from .state_core import StateCore

View File

@ -0,0 +1,16 @@
from ..token import Token
from .state_core import StateCore
def block(state: StateCore) -> None:
    """Core rule: run the block parser, or wrap the source as one inline token.

    In ``inlineMode`` the whole source becomes a single ``inline`` token
    mapped to line 0; otherwise the block tokenizer processes it.
    """
    if not state.inlineMode:
        state.md.block.parse(
            state.src, state.md, state.env, state.tokens, state.srcCharCode
        )
        return
    token = Token("inline", "", 0)
    token.content = state.src
    token.map = [0, 1]
    token.children = []
    state.tokens.append(token)

View File

@ -0,0 +1,10 @@
from .state_core import StateCore
def inline(state: StateCore) -> None:
    """Core rule: run the inline parser over every ``inline`` token's content."""
    for token in state.tokens:
        if token.type != "inline":
            continue
        if token.children is None:
            token.children = []
        state.md.inline.parse(token.content, state.md, state.env, token.children)

View File

@ -0,0 +1,141 @@
import re
from ..common.utils import arrayReplaceAt
from ..token import Token
from .state_core import StateCore
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
HTTP_RE = re.compile(r"^http://")
MAILTO_RE = re.compile(r"^mailto:")
TEST_MAILTO_RE = re.compile(r"^mailto:", flags=re.IGNORECASE)
def isLinkOpen(string: str) -> bool:
return bool(LINK_OPEN_RE.search(string))
def isLinkClose(string: str) -> bool:
return bool(LINK_CLOSE_RE.search(string))
def linkify(state: StateCore) -> None:
    """Core rule: convert bare URL-like text into link tokens.

    For each ``inline`` block token, scans its children backwards and
    replaces matching ``text`` spans with
    ``link_open``/``text``/``link_close`` triples.  Text already inside
    markdown or raw-HTML links is skipped.  Requires the optional linkify
    backend; raises ``ModuleNotFoundError`` if enabled but missing.
    """
    blockTokens = state.tokens
    if not state.md.options.linkify:
        return
    if not state.md.linkify:
        raise ModuleNotFoundError("Linkify enabled but not installed.")
    for j in range(len(blockTokens)):
        if blockTokens[j].type != "inline" or not state.md.linkify.pretest(
            blockTokens[j].content
        ):
            continue
        tokens = blockTokens[j].children
        htmlLinkLevel = 0
        # We scan from the end, to keep position when new tags added.
        # Use reversed logic in links start/end match
        assert tokens is not None
        i = len(tokens)
        while i >= 1:
            i -= 1
            assert isinstance(tokens, list)
            currentToken = tokens[i]
            # Skip content of markdown links
            if currentToken.type == "link_close":
                i -= 1
                while (
                    tokens[i].level != currentToken.level
                    and tokens[i].type != "link_open"
                ):
                    i -= 1
                continue
            # Skip content of html tag links
            if currentToken.type == "html_inline":
                # scanning backwards, so an opening <a> *closes* a region
                if isLinkOpen(currentToken.content) and htmlLinkLevel > 0:
                    htmlLinkLevel -= 1
                if isLinkClose(currentToken.content):
                    htmlLinkLevel += 1
            if htmlLinkLevel > 0:
                continue
            if currentToken.type == "text" and state.md.linkify.test(
                currentToken.content
            ):
                text = currentToken.content
                links = state.md.linkify.match(text)
                # Now split string to nodes
                nodes = []
                level = currentToken.level
                lastPos = 0
                for ln in range(len(links)):
                    url = links[ln].url
                    fullUrl = state.md.normalizeLink(url)
                    if not state.md.validateLink(fullUrl):
                        continue
                    urlText = links[ln].text
                    # Linkifier might send raw hostnames like "example.com", where url
                    # starts with domain name. So we prepend http:// in those cases,
                    # and remove it afterwards.
                    if not links[ln].schema:
                        urlText = HTTP_RE.sub(
                            "", state.md.normalizeLinkText("http://" + urlText)
                        )
                    elif links[ln].schema == "mailto:" and TEST_MAILTO_RE.search(
                        urlText
                    ):
                        urlText = MAILTO_RE.sub(
                            "", state.md.normalizeLinkText("mailto:" + urlText)
                        )
                    else:
                        urlText = state.md.normalizeLinkText(urlText)
                    pos = links[ln].index
                    # keep any plain text preceding this match
                    if pos > lastPos:
                        token = Token("text", "", 0)
                        token.content = text[lastPos:pos]
                        token.level = level
                        nodes.append(token)
                    token = Token("link_open", "a", 1)
                    token.attrs = {"href": fullUrl}
                    token.level = level
                    level += 1
                    token.markup = "linkify"
                    token.info = "auto"
                    nodes.append(token)
                    token = Token("text", "", 0)
                    token.content = urlText
                    token.level = level
                    nodes.append(token)
                    token = Token("link_close", "a", -1)
                    level -= 1
                    token.level = level
                    token.markup = "linkify"
                    token.info = "auto"
                    nodes.append(token)
                    lastPos = links[ln].last_index
                # keep any plain text after the last match
                if lastPos < len(text):
                    token = Token("text", "", 0)
                    token.content = text[lastPos:]
                    token.level = level
                    nodes.append(token)
                blockTokens[j].children = tokens = arrayReplaceAt(tokens, i, nodes)

View File

@ -0,0 +1,19 @@
"""Normalize input string."""
import re
from .state_core import StateCore
# https://spec.commonmark.org/0.29/#line-ending
NEWLINES_RE = re.compile(r"\r\n?|\n")
NULL_RE = re.compile(r"\0")
def normalize(state: StateCore) -> None:
    """Canonicalize line endings to LF and replace NUL with U+FFFD in ``state.src``."""
    state.src = NULL_RE.sub("\uFFFD", NEWLINES_RE.sub("\n", state.src))

View File

@ -0,0 +1,125 @@
"""Simple typographic replacements

* ``(c)``, ``(C)`` → ©
* ``(tm)``, ``(TM)`` → ™
* ``(r)``, ``(R)`` → ®
* ``(p)``, ``(P)`` → §
* ``+-`` → ±
* ``...`` → …
* ``?....`` → ?..
* ``!....`` → !..
* ``????????`` → ???
* ``!!!!!`` → !!!
* ``,,,`` → ,
* ``--`` → &ndash
* ``---`` → &mdash
"""
from __future__ import annotations
import logging
import re
from ..token import Token
from .state_core import StateCore
LOGGER = logging.getLogger(__name__)
# TODO:
# - fractionals 1/2, 1/4, 3/4 -> ½, ¼, ¾
# - miltiplication 2 x 4 -> 2 × 4
RARE_RE = re.compile(r"\+-|\.\.|\?\?\?\?|!!!!|,,|--")
# Workaround for phantomjs - need regex without /g flag,
# or root check will fail every second time
# SCOPED_ABBR_TEST_RE = r"\((c|tm|r|p)\)"
SCOPED_ABBR_RE = re.compile(r"\((c|tm|r|p)\)", flags=re.IGNORECASE)
PLUS_MINUS_RE = re.compile(r"\+-")
ELLIPSIS_RE = re.compile(r"\.{2,}")
ELLIPSIS_QUESTION_EXCLAMATION_RE = re.compile(r"([?!])…")
QUESTION_EXCLAMATION_RE = re.compile(r"([?!]){4,}")
COMMA_RE = re.compile(r",{2,}")
EM_DASH_RE = re.compile(r"(^|[^-])---(?=[^-]|$)", flags=re.MULTILINE)
EN_DASH_RE = re.compile(r"(^|\s)--(?=\s|$)", flags=re.MULTILINE)
EN_DASH_INDENT_RE = re.compile(r"(^|[^-\s])--(?=[^-\s]|$)", flags=re.MULTILINE)
SCOPED_ABBR = {"c": "©", "r": "®", "p": "§", "tm": ""}
def replaceFn(match: re.Match[str]):
    """Map a matched ``(c)``/``(tm)``/``(r)``/``(p)`` group to its symbol."""
    abbreviation = match.group(1).lower()
    return SCOPED_ABBR[abbreviation]
def replace_scoped(inlineTokens: list[Token]) -> None:
    """Apply the ``(c)``/``(tm)``/``(r)``/``(p)`` substitutions to text tokens.

    Text inside autolinks is left untouched.  Mirroring upstream
    markdown-it, the depth counter goes *negative* while inside an autolink
    (link_open decrements, link_close increments), so "outside" is exactly
    ``depth == 0``.
    """
    autolink_depth = 0
    for token in inlineTokens:
        if token.type == "text" and autolink_depth == 0:
            token.content = SCOPED_ABBR_RE.sub(replaceFn, token.content)
        elif token.type == "link_open" and token.info == "auto":
            autolink_depth -= 1
        elif token.type == "link_close" and token.info == "auto":
            autolink_depth += 1
def replace_rare(inlineTokens: list[Token]) -> None:
    """Apply rare typographic replacements (±, …, commas, dashes) to text tokens.

    Text inside autolinks is skipped; as in upstream markdown-it the depth
    counter goes negative inside an autolink, so "outside" is ``== 0``.
    """
    inside_autolink = 0
    for token in inlineTokens:
        if token.type == "text" and not inside_autolink:
            if RARE_RE.search(token.content):
                # +- -> ±
                token.content = PLUS_MINUS_RE.sub("±", token.content)
                # .., ..., ....... -> …
                # BUG FIX: the replacement must be the horizontal ellipsis
                # U+2026 (it had degraded to "" when the non-ASCII character
                # was dropped, deleting dot runs outright and also making the
                # "([?!])…" rule below unreachable).
                token.content = ELLIPSIS_RE.sub("…", token.content)
                # but ?..... & !..... -> ?.. & !..
                token.content = ELLIPSIS_QUESTION_EXCLAMATION_RE.sub(
                    "\\1..", token.content
                )
                # ???????? -> ???, !!!!! -> !!!
                token.content = QUESTION_EXCLAMATION_RE.sub("\\1\\1\\1", token.content)
                # ,, ,,, ,,,, -> ,
                token.content = COMMA_RE.sub(",", token.content)
                # em-dash
                token.content = EM_DASH_RE.sub("\\1\u2014", token.content)
                # en-dash
                token.content = EN_DASH_RE.sub("\\1\u2013", token.content)
                token.content = EN_DASH_INDENT_RE.sub("\\1\u2013", token.content)
        if token.type == "link_open" and token.info == "auto":
            inside_autolink -= 1
        if token.type == "link_close" and token.info == "auto":
            inside_autolink += 1
def replace(state: StateCore) -> None:
    """Core rule: run typographic replacements when the typographer option is on."""
    if not state.md.options.typographer:
        return
    for block_token in state.tokens:
        if block_token.type != "inline":
            continue
        assert block_token.children is not None
        content = block_token.content
        # Pre-test on the parent content avoids touching children needlessly.
        if SCOPED_ABBR_RE.search(content):
            replace_scoped(block_token.children)
        if RARE_RE.search(content):
            replace_rare(block_token.children)

View File

@ -0,0 +1,202 @@
"""Convert straight quotation marks to typographic ones
"""
from __future__ import annotations
import re
from typing import Any
from ..common.utils import charCodeAt, isMdAsciiPunct, isPunctChar, isWhiteSpace
from ..token import Token
from .state_core import StateCore
# Two identical patterns matching a straight single or double quote.
# NOTE(review): QUOTE_TEST_RE appears unused in this module's visible code
# (smartquotes() uses QUOTE_RE) — confirm before removing.
QUOTE_TEST_RE = re.compile(r"['\"]")
QUOTE_RE = re.compile(r"['\"]")
APOSTROPHE = "\u2019" # U+2019 right single quotation mark
def replaceAt(string: str, index: int, ch: str) -> str:
    """Return ``string`` with the single character at ``index`` replaced by ``ch``.

    ``index`` must be non-negative: a negative index would slice differently
    than in the JavaScript original, so it is rejected outright.
    """
    assert index >= 0
    head, tail = string[:index], string[index + 1 :]
    return head + ch + tail
def process_inlines(tokens: list[Token], state: StateCore) -> None:
    """Replace straight quotes in ``tokens`` with typographic ones, in place.

    Maintains a stack of candidate opening quotes across tokens; when a
    matching closer is found at the same nesting level, both are rewritten
    using ``state.md.options.quotes`` (indices 0/1 for double, 2/3 for
    single quotes).  Unpaired single quotes in word-interior position become
    apostrophes (U+2019).
    """
    stack: list[dict[str, Any]] = []
    for i in range(len(tokens)):
        token = tokens[i]
        thisLevel = token.level
        # Drop stack entries opened at a deeper nesting level than this token.
        j = 0
        for j in range(len(stack))[::-1]:
            if stack[j]["level"] <= thisLevel:
                break
        else:
            # When the loop is terminated without a "break".
            # Subtract 1 to get the same index as the js version.
            j -= 1
        stack = stack[: j + 1]
        if token.type != "text":
            continue
        text = token.content
        pos = 0
        maximum = len(text)
        while pos < maximum:
            # goto_outer emulates the JS labelled-continue ("OUTER") jump.
            goto_outer = False
            lastIndex = pos
            t = QUOTE_RE.search(text[lastIndex:])
            if not t:
                break
            canOpen = canClose = True
            pos = t.start(0) + lastIndex + 1
            isSingle = t.group(0) == "'"
            # Find previous character,
            # default to space if it's the beginning of the line
            lastChar = 0x20
            if t.start(0) + lastIndex - 1 >= 0:
                lastChar = charCodeAt(text, t.start(0) + lastIndex - 1)
            else:
                for j in range(i)[::-1]:
                    # lastChar defaults to 0x20
                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
                        break
                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
                    if not tokens[j].content:
                        continue
                    lastChar = charCodeAt(tokens[j].content, len(tokens[j].content) - 1)
                    break
            # Find next character,
            # default to space if it's the end of the line
            nextChar = 0x20
            if pos < maximum:
                nextChar = charCodeAt(text, pos)
            else:
                for j in range(i + 1, len(tokens)):
                    # nextChar defaults to 0x20
                    if tokens[j].type == "softbreak" or tokens[j].type == "hardbreak":
                        break
                    # should skip all tokens except 'text', 'html_inline' or 'code_inline'
                    if not tokens[j].content:
                        continue
                    nextChar = charCodeAt(tokens[j].content, 0)
                    break
            isLastPunctChar = isMdAsciiPunct(lastChar) or isPunctChar(chr(lastChar))
            isNextPunctChar = isMdAsciiPunct(nextChar) or isPunctChar(chr(nextChar))
            isLastWhiteSpace = isWhiteSpace(lastChar)
            isNextWhiteSpace = isWhiteSpace(nextChar)
            if isNextWhiteSpace:
                canOpen = False
            elif isNextPunctChar:
                if not (isLastWhiteSpace or isLastPunctChar):
                    canOpen = False
            if isLastWhiteSpace:
                canClose = False
            elif isLastPunctChar:
                if not (isNextWhiteSpace or isNextPunctChar):
                    canClose = False
            if nextChar == 0x22 and t.group(0) == '"': # 0x22: "
                if lastChar >= 0x30 and lastChar <= 0x39: # 0x30: 0, 0x39: 9
                    # special case: 1"" - count first quote as an inch
                    canClose = canOpen = False
            if canOpen and canClose:
                # Replace quotes in the middle of punctuation sequence, but not
                # in the middle of the words, i.e.:
                #
                # 1. foo " bar " baz - not replaced
                # 2. foo-"-bar-"-baz - replaced
                # 3. foo"bar"baz - not replaced
                canOpen = isLastPunctChar
                canClose = isNextPunctChar
            if not canOpen and not canClose:
                # middle of word
                if isSingle:
                    token.content = replaceAt(
                        token.content, t.start(0) + lastIndex, APOSTROPHE
                    )
                continue
            if canClose:
                # this could be a closing quote, rewind the stack to get a match
                for j in range(len(stack))[::-1]:
                    item = stack[j]
                    if stack[j]["level"] < thisLevel:
                        break
                    if item["single"] == isSingle and stack[j]["level"] == thisLevel:
                        item = stack[j]
                        if isSingle:
                            openQuote = state.md.options.quotes[2]
                            closeQuote = state.md.options.quotes[3]
                        else:
                            openQuote = state.md.options.quotes[0]
                            closeQuote = state.md.options.quotes[1]
                        # replace token.content *before* tokens[item.token].content,
                        # because, if they are pointing at the same token, replaceAt
                        # could mess up indices when quote length != 1
                        token.content = replaceAt(
                            token.content, t.start(0) + lastIndex, closeQuote
                        )
                        tokens[item["token"]].content = replaceAt(
                            tokens[item["token"]].content, item["pos"], openQuote
                        )
                        # advance past the (possibly multi-char) replacements
                        pos += len(closeQuote) - 1
                        if item["token"] == i:
                            pos += len(openQuote) - 1
                        text = token.content
                        maximum = len(text)
                        stack = stack[:j]
                        goto_outer = True
                        break
                if goto_outer:
                    goto_outer = False
                    continue
            if canOpen:
                stack.append(
                    {
                        "token": i,
                        "pos": t.start(0) + lastIndex,
                        "single": isSingle,
                        "level": thisLevel,
                    }
                )
            elif canClose and isSingle:
                token.content = replaceAt(
                    token.content, t.start(0) + lastIndex, APOSTROPHE
                )
def smartquotes(state: StateCore) -> None:
    """Core rule: convert straight quotes in inline tokens when the typographer is on."""
    if not state.md.options.typographer:
        return
    for token in state.tokens:
        if token.type != "inline":
            continue
        if not QUOTE_RE.search(token.content):
            continue
        assert token.children is not None
        process_inlines(token.children, state)

View File

@ -0,0 +1,25 @@
from __future__ import annotations
from collections.abc import MutableMapping
from typing import TYPE_CHECKING
from ..ruler import StateBase
from ..token import Token
if TYPE_CHECKING:
from markdown_it import MarkdownIt
class StateCore(StateBase):
    """Mutable state threaded through the core rule chain.

    Core rules (e.g. normalize, block, inline, linkify, replace,
    smartquotes in this package) read and mutate ``src`` and ``tokens``
    in place.
    """
    def __init__(
        self,
        src: str,
        md: MarkdownIt,
        env: MutableMapping,
        tokens: list[Token] | None = None,
    ):
        self.src = src
        self.md = md # link to parser instance
        self.env = env
        # NOTE(review): ``tokens or []`` replaces a passed-in *empty* list
        # with a fresh one, so callers cannot alias an empty ``tokens``
        # argument — confirm this is intentional.
        self.tokens: list[Token] = tokens or []
        self.inlineMode = False

View File

@ -0,0 +1,29 @@
__all__ = (
"StateInline",
"text",
"text_collapse",
"link_pairs",
"escape",
"newline",
"backtick",
"emphasis",
"image",
"link",
"autolink",
"entity",
"html_inline",
"strikethrough",
)
from . import emphasis, strikethrough
from .autolink import autolink
from .backticks import backtick
from .balance_pairs import link_pairs
from .entity import entity
from .escape import escape
from .html_inline import html_inline
from .image import image
from .link import link
from .newline import newline
from .state_inline import StateInline
from .text import text
from .text_collapse import text_collapse

Some files were not shown because too many files have changed in this diff Show More