From 6741c12f43b3daac61e2dd5ac56d36fffe5e241a Mon Sep 17 00:00:00 2001 From: Mathieu Duponchelle Date: Fri, 6 Jan 2017 17:39:59 +0100 Subject: [PATCH 1/5] Implement and expose reference nodes. Added cmark_node_get_label(node). Added cmark_node_set_label(node). API change. --- api_test/main.c | 16 ++++++++++++++- src/blocks.c | 2 +- src/cmark.h | 21 +++++++++++++++----- src/commonmark.c | 3 +++ src/html.c | 3 +++ src/inlines.c | 32 +++++++++++++++++++++++------- src/inlines.h | 3 ++- src/latex.c | 3 +++ src/man.c | 1 + src/node.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ src/node.h | 7 +++++++ src/references.c | 8 ++++---- src/references.h | 5 +++-- src/xml.c | 11 +++++++++++ 14 files changed, 145 insertions(+), 21 deletions(-) diff --git a/api_test/main.c b/api_test/main.c index 17e1582b4..133728700 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -90,7 +90,10 @@ static void accessors(test_batch_runner *runner) { "\n" "
html
\n" "\n" - "[link](url 'title')\n"; + "[link](url 'title')\n" + "\n" + "[foo]: /bar 'title'\n" + "\n"; cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1, CMARK_OPT_DEFAULT); @@ -140,6 +143,11 @@ static void accessors(test_batch_runner *runner) { cmark_node *string = cmark_node_first_child(link); STR_EQ(runner, cmark_node_get_literal(string), "link", "get_literal string"); + cmark_node *reference = cmark_node_next(paragraph); + STR_EQ(runner, cmark_node_get_url(reference), "/bar", "get_reference_url"); + STR_EQ(runner, cmark_node_get_title(reference), "title", "get_reference_title"); + STR_EQ(runner, cmark_node_get_label(reference), "foo", "get_reference_label"); + // Setters OK(runner, cmark_node_set_heading_level(heading, 3), "set_heading_level"); @@ -169,6 +177,10 @@ static void accessors(test_batch_runner *runner) { OK(runner, cmark_node_set_url(link, "URL"), "set_url"); OK(runner, cmark_node_set_title(link, "TITLE"), "set_title"); + OK(runner, cmark_node_set_url(reference, "URL"), "set_reference_url"); + OK(runner, cmark_node_set_title(reference, "TITLE"), "set_reference_title"); + OK(runner, cmark_node_set_label(reference, "LABEL"), "set_reference_label"); + OK(runner, cmark_node_set_literal(string, "prefix-LINK"), "set_literal string"); @@ -214,6 +226,7 @@ static void accessors(test_batch_runner *runner) { "get_fence_info error"); OK(runner, cmark_node_get_url(html) == NULL, "get_url error"); OK(runner, cmark_node_get_title(heading) == NULL, "get_title error"); + OK(runner, cmark_node_get_label(link) == NULL, "get_label error"); // Setter errors @@ -229,6 +242,7 @@ static void accessors(test_batch_runner *runner) { "set_fence_info error"); OK(runner, !cmark_node_set_url(html, "url"), "set_url error"); OK(runner, !cmark_node_set_title(heading, "title"), "set_title error"); + OK(runner, !cmark_node_set_label(link, "label"), "set_label error"); OK(runner, !cmark_node_set_heading_level(heading, 0), "set_heading_level too small"); diff --git 
a/src/blocks.c b/src/blocks.c index 99dd08265..078caa893 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -257,7 +257,7 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) { case CMARK_NODE_PARAGRAPH: while (cmark_strbuf_at(node_content, 0) == '[' && (pos = cmark_parse_reference_inline(parser->mem, node_content, - parser->refmap))) { + parser->refmap, parser->root))) { cmark_strbuf_drop(node_content, pos); } diff --git a/src/cmark.h b/src/cmark.h index 6ed7eb057..7f7daaed9 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -45,9 +45,10 @@ typedef enum { CMARK_NODE_PARAGRAPH, CMARK_NODE_HEADING, CMARK_NODE_THEMATIC_BREAK, + CMARK_NODE_REFERENCE, CMARK_NODE_FIRST_BLOCK = CMARK_NODE_DOCUMENT, - CMARK_NODE_LAST_BLOCK = CMARK_NODE_THEMATIC_BREAK, + CMARK_NODE_LAST_BLOCK = CMARK_NODE_REFERENCE, /* Inline */ CMARK_NODE_TEXT, @@ -333,26 +334,36 @@ CMARK_EXPORT const char *cmark_node_get_fence_info(cmark_node *node); */ CMARK_EXPORT int cmark_node_set_fence_info(cmark_node *node, const char *info); -/** Returns the URL of a link or image 'node', or an empty string +/** Returns the URL of a link, image or reference 'node', or an empty string if no URL is set. */ CMARK_EXPORT const char *cmark_node_get_url(cmark_node *node); -/** Sets the URL of a link or image 'node'. Returns 1 on success, +/** Sets the URL of a link, image or reference 'node'. Returns 1 on success, * 0 on failure. */ CMARK_EXPORT int cmark_node_set_url(cmark_node *node, const char *url); -/** Returns the title of a link or image 'node', or an empty +/** Returns the title of a link, image or reference 'node', or an empty string if no title is set. */ CMARK_EXPORT const char *cmark_node_get_title(cmark_node *node); -/** Sets the title of a link or image 'node'. Returns 1 on success, +/** Sets the title of a link, image or reference 'node'. Returns 1 on success, * 0 on failure. 
*/ CMARK_EXPORT int cmark_node_set_title(cmark_node *node, const char *title); +/** Returns the label of a reference 'node', or an empty + string if no label is set. + */ +CMARK_EXPORT const char *cmark_node_get_label(cmark_node *node); + +/** Sets the label of a reference 'node'. Returns 1 on success, + * 0 on failure. + */ +CMARK_EXPORT int cmark_node_set_label(cmark_node *node, const char *label); + /** Returns the literal "on enter" text for a custom 'node', or an empty string if no on_enter is set. */ diff --git a/src/commonmark.c b/src/commonmark.c index b8b182068..7479079ec 100644 --- a/src/commonmark.c +++ b/src/commonmark.c @@ -336,6 +336,9 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, OUT(cmark_node_get_literal(node), allow_wrap, NORMAL); break; + case CMARK_NODE_REFERENCE: + break; + case CMARK_NODE_LINEBREAK: if (!(CMARK_OPT_HARDBREAKS & options)) { LIT(" "); diff --git a/src/html.c b/src/html.c index a680e4a50..c10dcc478 100644 --- a/src/html.c +++ b/src/html.c @@ -217,6 +217,9 @@ static int S_render_node(cmark_node *node, cmark_event_type ev_type, } break; + case CMARK_NODE_REFERENCE: + break; + case CMARK_NODE_TEXT: escape_html(html, node->as.literal.data, node->as.literal.len); break; diff --git a/src/inlines.c b/src/inlines.c index 92e79c787..fccdd9104 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -1190,7 +1190,8 @@ static void spnl(subject *subj) { // Return 0 if no reference found, otherwise position of subject // after reference is parsed. 
bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, - cmark_reference_map *refmap) { + cmark_reference_map *refmap, + cmark_node *root) { subject subj; cmark_chunk lab; @@ -1199,18 +1200,20 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, bufsize_t matchlen = 0; bufsize_t beforetitle; + cmark_reference *ref; + cmark_node *reference = cmark_node_new(CMARK_NODE_REFERENCE); subject_from_buf(mem, &subj, input, NULL); // parse label: if (!link_label(&subj, &lab) || lab.len == 0) - return 0; + goto nomatch; // colon: if (peek_char(&subj) == ':') { advance(&subj); } else { - return 0; + goto nomatch; } // parse link url: @@ -1220,7 +1223,7 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, url = cmark_chunk_dup(&subj.input, subj.pos, matchlen); subj.pos += matchlen; } else { - return 0; + goto nomatch; } // parse optional link_title @@ -1241,14 +1244,29 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, if (matchlen) { // try rewinding before title subj.pos = beforetitle; skip_spaces(&subj); + title = cmark_chunk_literal(""); if (!skip_line_end(&subj)) { - return 0; + goto nomatch; } } else { - return 0; + goto nomatch; } } // insert reference into refmap - cmark_reference_create(refmap, &lab, &url, &title); + ref = cmark_reference_create(refmap, &lab, &url, &title); + + if (ref) { + cmark_chunk_set_cstr(mem, &reference->as.reference.label, (char *) ref->label); + cmark_chunk_set_cstr(mem, &reference->as.reference.url, cmark_chunk_to_cstr(mem, &ref->url)); + cmark_chunk_set_cstr(mem, &reference->as.reference.title, cmark_chunk_to_cstr(mem, &ref->title)); + cmark_node_append_child(root, reference); + + cmark_reference_add(refmap, ref); + } + return subj.pos; + +nomatch: + cmark_node_free(reference); + return 0; } diff --git a/src/inlines.h b/src/inlines.h index 52be76820..a09a75940 100644 --- a/src/inlines.h +++ b/src/inlines.h @@ -12,7 +12,8 @@ void 
cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, cmark_reference_map *refmap, int options); bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, - cmark_reference_map *refmap); + cmark_reference_map *refmap, + cmark_node *root); #ifdef __cplusplus } diff --git a/src/latex.c b/src/latex.c index e78c7d916..3dd5f0798 100644 --- a/src/latex.c +++ b/src/latex.c @@ -332,6 +332,9 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, } break; + case CMARK_NODE_REFERENCE: + break; + case CMARK_NODE_TEXT: OUT(cmark_node_get_literal(node), allow_wrap, NORMAL); break; diff --git a/src/man.c b/src/man.c index 1c76f68bb..9c2b0629a 100644 --- a/src/man.c +++ b/src/man.c @@ -82,6 +82,7 @@ static int S_render_node(cmark_renderer *renderer, cmark_node *node, switch (node->type) { case CMARK_NODE_DOCUMENT: + case CMARK_NODE_REFERENCE: break; case CMARK_NODE_BLOCK_QUOTE: diff --git a/src/node.c b/src/node.c index e722acf90..44fc28afa 100644 --- a/src/node.c +++ b/src/node.c @@ -65,6 +65,9 @@ static bool S_can_contain(cmark_node *node, cmark_node *child) { case CMARK_NODE_CUSTOM_INLINE: return S_is_inline(child); + case CMARK_NODE_REFERENCE: + return false; + default: break; } @@ -123,6 +126,11 @@ static void S_free_nodes(cmark_node *e) { cmark_chunk_free(NODE_MEM(e), &e->as.link.url); cmark_chunk_free(NODE_MEM(e), &e->as.link.title); break; + case CMARK_NODE_REFERENCE: + cmark_chunk_free(NODE_MEM(e), &e->as.reference.url); + cmark_chunk_free(NODE_MEM(e), &e->as.reference.title); + cmark_chunk_free(NODE_MEM(e), &e->as.reference.label); + break; case CMARK_NODE_CUSTOM_BLOCK: case CMARK_NODE_CUSTOM_INLINE: cmark_chunk_free(NODE_MEM(e), &e->as.custom.on_enter); @@ -182,6 +190,8 @@ const char *cmark_node_get_type_string(cmark_node *node) { return "paragraph"; case CMARK_NODE_HEADING: return "heading"; + case CMARK_NODE_REFERENCE: + return "reference"; case CMARK_NODE_THEMATIC_BREAK: return "thematic_break"; case CMARK_NODE_TEXT: @@ -486,6 
+496,8 @@ const char *cmark_node_get_url(cmark_node *node) { case CMARK_NODE_LINK: case CMARK_NODE_IMAGE: return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.link.url); + case CMARK_NODE_REFERENCE: + return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.reference.url); default: break; } @@ -503,6 +515,9 @@ int cmark_node_set_url(cmark_node *node, const char *url) { case CMARK_NODE_IMAGE: cmark_chunk_set_cstr(NODE_MEM(node), &node->as.link.url, url); return 1; + case CMARK_NODE_REFERENCE: + cmark_chunk_set_cstr(NODE_MEM(node), &node->as.reference.url, url); + return 1; default: break; } @@ -519,6 +534,8 @@ const char *cmark_node_get_title(cmark_node *node) { case CMARK_NODE_LINK: case CMARK_NODE_IMAGE: return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.link.title); + case CMARK_NODE_REFERENCE: + return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.reference.title); default: break; } @@ -536,6 +553,40 @@ int cmark_node_set_title(cmark_node *node, const char *title) { case CMARK_NODE_IMAGE: cmark_chunk_set_cstr(NODE_MEM(node), &node->as.link.title, title); return 1; + case CMARK_NODE_REFERENCE: + cmark_chunk_set_cstr(NODE_MEM(node), &node->as.reference.title, title); + return 1; + default: + break; + } + + return 0; +} + +const char *cmark_node_get_label(cmark_node *node) { + if (node == NULL) { + return NULL; + } + + switch (node->type) { + case CMARK_NODE_REFERENCE: + return cmark_chunk_to_cstr(NODE_MEM(node), &node->as.reference.label); + default: + break; + } + + return NULL; +} + +int cmark_node_set_label(cmark_node *node, const char *label) { + if (node == NULL) { + return 0; + } + + switch (node->type) { + case CMARK_NODE_REFERENCE: + cmark_chunk_set_cstr(NODE_MEM(node), &node->as.reference.label, label); + return 1; default: break; } diff --git a/src/node.h b/src/node.h index 65d857f0b..35bd6d4aa 100644 --- a/src/node.h +++ b/src/node.h @@ -41,6 +41,12 @@ typedef struct { cmark_chunk title; } cmark_link; +typedef struct { + cmark_chunk label; + cmark_chunk 
url; + cmark_chunk title; +} cmark_reference_node; + typedef struct { cmark_chunk on_enter; cmark_chunk on_exit; @@ -75,6 +81,7 @@ struct cmark_node { cmark_code code; cmark_heading heading; cmark_link link; + cmark_reference_node reference; cmark_custom custom; int html_block_type; } as; diff --git a/src/references.c b/src/references.c index 89f2dc8cb..c9e5d06b0 100644 --- a/src/references.c +++ b/src/references.c @@ -53,7 +53,7 @@ static unsigned char *normalize_reference(cmark_mem *mem, cmark_chunk *ref) { return result; } -static void add_reference(cmark_reference_map *map, cmark_reference *ref) { +void cmark_reference_add(cmark_reference_map *map, cmark_reference *ref) { cmark_reference *t = ref->next = map->table[ref->hash % REFMAP_SIZE]; while (t) { @@ -68,14 +68,14 @@ static void add_reference(cmark_reference_map *map, cmark_reference *ref) { map->table[ref->hash % REFMAP_SIZE] = ref; } -void cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, +cmark_reference *cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, cmark_chunk *url, cmark_chunk *title) { cmark_reference *ref; unsigned char *reflabel = normalize_reference(map->mem, label); /* empty reference name, or composed from only whitespace */ if (reflabel == NULL) - return; + return NULL; ref = (cmark_reference *)map->mem->calloc(1, sizeof(*ref)); ref->label = reflabel; @@ -84,7 +84,7 @@ void cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, ref->title = cmark_clean_title(map->mem, title); ref->next = NULL; - add_reference(map, ref); + return ref; } // Returns reference if refmap contains a reference with matching diff --git a/src/references.h b/src/references.h index f075bbbd9..be174533b 100644 --- a/src/references.h +++ b/src/references.h @@ -31,8 +31,9 @@ cmark_reference_map *cmark_reference_map_new(cmark_mem *mem); void cmark_reference_map_free(cmark_reference_map *map); cmark_reference *cmark_reference_lookup(cmark_reference_map *map, cmark_chunk 
*label); -extern void cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, - cmark_chunk *url, cmark_chunk *title); +void cmark_reference_add(cmark_reference_map *map, cmark_reference *ref); +extern cmark_reference *cmark_reference_create(cmark_reference_map *map, cmark_chunk *label, + cmark_chunk *url, cmark_chunk *title); #ifdef __cplusplus } diff --git a/src/xml.c b/src/xml.c index 4898cd2e8..fcbd93db5 100644 --- a/src/xml.c +++ b/src/xml.c @@ -126,6 +126,17 @@ static int S_render_node(cmark_node *node, cmark_event_type ev_type, escape_xml(xml, node->as.link.title.data, node->as.link.title.len); cmark_strbuf_putc(xml, '"'); break; + case CMARK_NODE_REFERENCE: + cmark_strbuf_puts(xml, " label=\""); + escape_xml(xml, node->as.reference.label.data, node->as.reference.label.len); + cmark_strbuf_putc(xml, '"'); + cmark_strbuf_puts(xml, " destination=\""); + escape_xml(xml, node->as.reference.url.data, node->as.reference.url.len); + cmark_strbuf_putc(xml, '"'); + cmark_strbuf_puts(xml, " title=\""); + escape_xml(xml, node->as.reference.title.data, node->as.reference.title.len); + cmark_strbuf_putc(xml, '"'); + break; default: break; } From 5e16e75f2c77eca94e9d37c04714ad0afa062367 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Fri, 6 Jan 2017 18:06:10 +0100 Subject: [PATCH 2/5] Improve the parser's error handling. * Improve strbuf guarantees Introduce BUFSIZE_MAX macro and make sure that the strbuf implementation can handle strings up to this size. * Abort early if document size exceeds internal limit * Make parser return NULL on internal index overflow Make S_parser_feed set an error and ignore subsequent chunks if the total input document size exceeds an internal limit. Make cmark_parser_finish return NULL if an error was encountered. Add public API functions to retrieve error code and error message. strbuf overflow in renderers and OOM in parser or renderers still cause an abort. 
--- api_test/main.c | 37 +++++++++++++++++++++++ src/blocks.c | 80 ++++++++++++++++++++++++++++++++++++++++--------- src/buffer.c | 32 +++++++++++++------- src/buffer.h | 20 +++++++++++++ src/cmark.c | 3 ++ src/cmark.h | 22 ++++++++++++-- src/main.c | 5 ++++ src/parser.h | 3 ++ test/cmark.py | 2 ++ 9 files changed, 177 insertions(+), 27 deletions(-) diff --git a/api_test/main.c b/api_test/main.c index 133728700..431e22bbe 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -5,6 +5,7 @@ #define CMARK_NO_SHORT_NAMES #include "cmark.h" #include "node.h" +#include "parser.h" #include "harness.h" #include "cplusplus.h" @@ -897,6 +898,41 @@ static void test_feed_across_line_ending(test_batch_runner *runner) { cmark_node_free(document); } +static cmark_node *S_parse_with_fake_total(bufsize_t fake_total, + const char *str, + cmark_err_type *err) { + cmark_parser *parser = cmark_parser_new(CMARK_OPT_DEFAULT); + parser->total_bytes = fake_total; + cmark_parser_feed(parser, str, strlen(str)); + cmark_node *doc = cmark_parser_finish(parser); + *err = cmark_parser_get_error(parser); + cmark_parser_free(parser); + return doc; +} + +static void test_bufsize_overflow(test_batch_runner *runner) { + cmark_node *doc; + cmark_err_type err; + + doc = S_parse_with_fake_total(BUFSIZE_MAX, "a", &err); + OK(runner, doc == NULL, "parse 1 byte after BUFSIZE_MAX bytes fails"); + INT_EQ(runner, err, CMARK_ERR_INPUT_TOO_LARGE, + "parse 1 byte after BUFSIZE_MAX bytes error code"); + + doc = S_parse_with_fake_total(BUFSIZE_MAX - 9, "0123456789", &err); + OK(runner, doc == NULL, "parse 10 byte after BUFSIZE_MAX-9 bytes fails"); + INT_EQ(runner, err, CMARK_ERR_INPUT_TOO_LARGE, + "parse 10 byte after BUFSIZE_MAX-9 bytes error code"); + + doc = S_parse_with_fake_total(BUFSIZE_MAX - 1, "a", &err); + OK(runner, doc != NULL, "parse 1 byte after BUFSIZE_MAX-1 bytes"); + cmark_node_free(doc); + + doc = S_parse_with_fake_total(BUFSIZE_MAX - 10, "0123456789", &err); + OK(runner, doc != NULL, "parse 10 
byte after BUFSIZE_MAX-10 bytes"); + cmark_node_free(doc); +} + int main() { int retval; test_batch_runner *runner = test_batch_runner_new(); @@ -922,6 +958,7 @@ int main() { test_cplusplus(runner); test_safe(runner); test_feed_across_line_ending(runner); + test_bufsize_overflow(runner); test_print_summary(runner); retval = test_ok(runner) ? 0 : 1; diff --git a/src/blocks.c b/src/blocks.c index 078caa893..301178362 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -92,6 +92,8 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) { parser->refmap = cmark_reference_map_new(mem); parser->root = document; parser->current = document; + parser->error_code = CMARK_ERR_NONE; + parser->total_bytes = 0; parser->line_number = 0; parser->offset = 0; parser->column = 0; @@ -526,6 +528,20 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, const unsigned char *end = buffer + len; static const uint8_t repl[] = {239, 191, 189}; + if (parser->error_code) { + return; + } + + // Limit maximum document size to BUFSIZE_MAX. This makes sure that we + // never create strbufs larger than BUFSIZE_MAX. On overflow, record the + // error and ignore the rest of the input; cmark_parser_finish() then + // returns NULL. + if (len > (size_t)(BUFSIZE_MAX - parser->total_bytes)) { + parser->error_code = CMARK_ERR_INPUT_TOO_LARGE; + return; + } + parser->total_bytes += (bufsize_t)len; + if (parser->last_buffer_ended_with_cr && *buffer == '\n') { // skip NL if last buffer ended with CR ; see #117 buffer++; @@ -946,8 +962,23 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, // Note that we can have new list items starting with >= 4 // spaces indent, as long as the list container is still open.
+ cmark_node *list = NULL; + cmark_node *item = NULL; int i = 0; + if (cont_type != CMARK_NODE_LIST || + !lists_match(&((*container)->as.list), data)) { + *container = add_child(parser, *container, CMARK_NODE_LIST, + parser->first_nonspace + 1); + list = *container; + + } + + // add the list item + *container = add_child(parser, *container, CMARK_NODE_ITEM, + parser->first_nonspace + 1); + item = *container; + // compute padding: S_advance_offset(parser, input, parser->first_nonspace + matched - parser->offset, @@ -982,19 +1013,12 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, data->marker_offset = parser->indent; - if (cont_type != CMARK_NODE_LIST || - !lists_match(&((*container)->as.list), data)) { - *container = add_child(parser, *container, CMARK_NODE_LIST, - parser->first_nonspace + 1); - - memcpy(&((*container)->as.list), data, sizeof(*data)); - } - - // add the list item - *container = add_child(parser, *container, CMARK_NODE_ITEM, - parser->first_nonspace + 1); /* TODO: static */ - memcpy(&((*container)->as.list), data, sizeof(*data)); + if (list) + memcpy(&(list->as.list), data, sizeof(*data)); + if (item) + memcpy(&(item->as.list), data, sizeof(*data)); + parser->mem->free(data); } else if (indented && !maybe_lazy && !parser->blank) { S_advance_offset(parser, input, CODE_INDENT, true); @@ -1190,14 +1214,19 @@ cmark_node *cmark_parser_finish(cmark_parser *parser) { cmark_strbuf_clear(&parser->linebuf); } + cmark_strbuf_clear(&parser->curline); + + if (parser->error_code) { + cmark_node_free(parser->root); + return NULL; + } + finalize_document(parser); if (parser->options & CMARK_OPT_NORMALIZE) { cmark_consolidate_text_nodes(parser->root); } - cmark_strbuf_free(&parser->curline); - #if CMARK_DEBUG_NODES if (cmark_node_check(parser->root, stderr)) { abort(); @@ -1205,3 +1234,26 @@ cmark_node *cmark_parser_finish(cmark_parser *parser) { #endif return parser->root; } + +cmark_err_type cmark_parser_get_error(cmark_parser *parser) 
{ + return parser->error_code; +} + +const char *cmark_parser_get_error_message(cmark_parser *parser) { + const char *str = NULL; + + switch (parser->error_code) { + case CMARK_ERR_OUT_OF_MEMORY: + str = "Out of memory"; + break; + case CMARK_ERR_INPUT_TOO_LARGE: + str = "Input too large"; + break; + default: + str = "Unknown error"; + break; + } + + return str; +} + diff --git a/src/buffer.c b/src/buffer.c index a6754b64f..9a9e9adcc 100644 --- a/src/buffer.c +++ b/src/buffer.c @@ -33,6 +33,11 @@ void cmark_strbuf_init(cmark_mem *mem, cmark_strbuf *buf, } static CMARK_INLINE void S_strbuf_grow_by(cmark_strbuf *buf, bufsize_t add) { + // Safety check for overflow. + if (add > BUFSIZE_MAX - buf->size) { + fprintf(stderr, "Internal cmark_strbuf overflow"); + abort(); + } cmark_strbuf_grow(buf, buf->size + add); } @@ -42,18 +47,25 @@ void cmark_strbuf_grow(cmark_strbuf *buf, bufsize_t target_size) { if (target_size < buf->asize) return; - if (target_size > (bufsize_t)(INT32_MAX / 2)) - abort(); - - /* Oversize the buffer by 50% to guarantee amortized linear time - * complexity on append operations. */ - bufsize_t new_size = target_size + target_size / 2; - new_size += 1; - new_size = (new_size + 7) & ~7; + // Oversize the buffer by 50% to guarantee amortized linear time + // complexity on append operations. + bufsize_t add = target_size / 2; + // Account for terminating NUL byte. + add += 1; + // Round up to multiple of eight. + add = (add + 7) & ~7; + + // Check for overflow but allow an additional NUL byte. + if (target_size + add > BUFSIZE_MAX + 1) { + target_size = BUFSIZE_MAX + 1; + } + else { + target_size += add; + } buf->ptr = (unsigned char *)buf->mem->realloc(buf->asize ? 
buf->ptr : NULL, - new_size); - buf->asize = new_size; + target_size); + buf->asize = target_size; } bufsize_t cmark_strbuf_len(const cmark_strbuf *buf) { return buf->size; } diff --git a/src/buffer.h b/src/buffer.h index e8780753f..7f31a74bb 100644 --- a/src/buffer.h +++ b/src/buffer.h @@ -13,8 +13,28 @@ extern "C" { #endif +#ifndef CMARK_HUGE_DOCS + +// Maximum strbuf size without terminating NUL byte. +#define BUFSIZE_MAX (INT32_MAX - 1) + typedef int32_t bufsize_t; +#else // CMARK_HUGE_DOCS + +// This is an untested proof of concept of how to handle multi-gigabyte +// documents on 64-bit platforms at the expense of internal struct sizes. + +#ifdef PTRDIFF_MAX + #define BUFSIZE_MAX (PTRDIFF_MAX - 1) +#else + #define BUFSIZE_MAX (ptrdiff_t)((size_t)-1 / 2) +#endif + +typedef ptrdiff_t bufsize_t; + +#endif // CMARK_HUGE_DOCS + typedef struct { cmark_mem *mem; unsigned char *ptr; diff --git a/src/cmark.c b/src/cmark.c index 0d3bc1669..ebd933312 100644 --- a/src/cmark.c +++ b/src/cmark.c @@ -31,6 +31,9 @@ char *cmark_markdown_to_html(const char *text, size_t len, int options) { char *result; doc = cmark_parse_document(text, len, options); + if (doc == NULL) { + return NULL; + } result = cmark_render_html(doc, options); cmark_node_free(doc); diff --git a/src/cmark.h b/src/cmark.h index 7f7daaed9..389bc3814 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -22,7 +22,7 @@ extern "C" { /** Convert 'text' (assumed to be a UTF-8 encoded string with length * 'len') from CommonMark Markdown to HTML, returning a null-terminated, * UTF-8-encoded string. It is the caller's responsibility - * to free the returned buffer. + * to free the returned buffer. Returns NULL on error. 
*/ CMARK_EXPORT char *cmark_markdown_to_html(const char *text, size_t len, int options); @@ -84,6 +84,12 @@ typedef enum { CMARK_PAREN_DELIM } cmark_delim_type; +typedef enum { + CMARK_ERR_NONE, + CMARK_ERR_OUT_OF_MEMORY, + CMARK_ERR_INPUT_TOO_LARGE +} cmark_err_type; + typedef struct cmark_node cmark_node; typedef struct cmark_parser cmark_parser; typedef struct cmark_iter cmark_iter; @@ -478,12 +484,22 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem); CMARK_EXPORT void cmark_parser_free(cmark_parser *parser); +/** Return the error code after a failed operation. + */ +CMARK_EXPORT +cmark_err_type cmark_parser_get_error(cmark_parser *parser); + +/** Return a human-readable error message after a failed operation. + */ +CMARK_EXPORT +const char *cmark_parser_get_error_message(cmark_parser *parser); + /** Feeds a string of length 'len' to 'parser'. */ CMARK_EXPORT void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len); -/** Finish parsing and return a pointer to a tree of nodes. +/** Finish parsing and return a pointer to a tree of nodes or NULL on error. */ CMARK_EXPORT cmark_node *cmark_parser_finish(cmark_parser *parser); @@ -491,7 +507,7 @@ cmark_node *cmark_parser_finish(cmark_parser *parser); /** Parse a CommonMark document in 'buffer' of length 'len'. * Returns a pointer to a tree of nodes. The memory allocated for * the node tree should be released using 'cmark_node_free' - * when it is no longer needed. + * when it is no longer needed. Returns NULL on error.
*/ CMARK_EXPORT cmark_node *cmark_parse_document(const char *buffer, size_t len, int options); diff --git a/src/main.c b/src/main.c index 42cd8b163..aeb81de4d 100644 --- a/src/main.c +++ b/src/main.c @@ -181,6 +181,11 @@ int main(int argc, char *argv[]) { document = cmark_parser_finish(parser); cmark_parser_free(parser); + if (document == NULL) { + fprintf(stderr, "%s", cmark_parser_get_error_message(parser)); + exit(1); + } + print_document(document, writer, options, width); cmark_node_free(document); diff --git a/src/parser.h b/src/parser.h index 0c5033bd2..ec8c9b889 100644 --- a/src/parser.h +++ b/src/parser.h @@ -2,6 +2,7 @@ #define CMARK_AST_H #include +#include "cmark.h" #include "node.h" #include "buffer.h" #include "memory.h" @@ -17,6 +18,8 @@ struct cmark_parser { struct cmark_reference_map *refmap; struct cmark_node *root; struct cmark_node *current; + cmark_err_type error_code; + bufsize_t total_bytes; int line_number; bufsize_t offset; bufsize_t column; diff --git a/test/cmark.py b/test/cmark.py index 4be85a3b0..f4ff5765b 100644 --- a/test/cmark.py +++ b/test/cmark.py @@ -30,6 +30,8 @@ def to_commonmark(lib, text): render_commonmark.restype = c_char_p render_commonmark.argtypes = [c_void_p, c_int, c_int] node = parse_document(textbytes, textlen, 0) + if node is None: + raise Exception("parse_document failed") result = render_commonmark(node, 0, 0).decode('utf-8') return [0, result, ''] From 550e3eb43ee4ed1e10045c0072c57ffe189e0afa Mon Sep 17 00:00:00 2001 From: Mathieu Duponchelle Date: Fri, 6 Jan 2017 18:11:48 +0100 Subject: [PATCH 3/5] Implement and expose a source map. Its generation is conditional on the OPT_SOURCEPOS flag being set. Add cmark_parser_get_first_source_extent(parser). Add cmark_source_extent_get_start(extent). Add cmark_source_extent_get_stop(extent). Add cmark_source_extent_get_next(extent). Add cmark_source_extent_get_previous(extent). Add cmark_source_extent_get_node(extent). Add cmark_source_extent_get_type(extent).
Add cmark_source_extent_get_type_string(extent). API change. --- src/CMakeLists.txt | 2 + src/blocks.c | 170 ++++++++++++++++------ src/cmark.h | 55 ++++++++ src/inlines.c | 149 ++++++++++++++++++-- src/inlines.h | 10 +- src/parser.h | 4 + src/source_map.c | 344 +++++++++++++++++++++++++++++++++++++++++++++ src/source_map.h | 74 ++++++++++ test/cmark.py | 5 +- 9 files changed, 755 insertions(+), 58 deletions(-) create mode 100644 src/source_map.c create mode 100644 src/source_map.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f52ded6ca..c7761ff3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -18,6 +18,7 @@ set(HEADERS houdini.h cmark_ctype.h render.h + source_map.h ) set(LIBRARY_SOURCES cmark.c @@ -40,6 +41,7 @@ set(LIBRARY_SOURCES houdini_html_e.c houdini_html_u.c cmark_ctype.c + source_map.c ${HEADERS} ) diff --git a/src/blocks.c b/src/blocks.c index 301178362..4abe7d3c4 100644 --- a/src/blocks.c +++ b/src/blocks.c @@ -28,6 +28,10 @@ #define MIN(x, y) ((x < y) ? x : y) #endif +#ifndef MAX +#define MAX(x, y) ((x > y) ? 
x : y) +#endif + #define peek_at(i, n) (i)->data[n] static bool S_last_line_blank(const cmark_node *node) { @@ -95,6 +99,7 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) { parser->error_code = CMARK_ERR_NONE; parser->total_bytes = 0; parser->line_number = 0; + parser->line_offset = 0; parser->offset = 0; parser->column = 0; parser->first_nonspace = 0; @@ -106,6 +111,9 @@ cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) { parser->options = options; parser->last_buffer_ended_with_cr = false; + if (options & CMARK_OPT_SOURCEPOS) + parser->source_map = source_map_new(mem); + return parser; } @@ -118,6 +126,7 @@ void cmark_parser_free(cmark_parser *parser) { cmark_mem *mem = parser->mem; cmark_strbuf_free(&parser->curline); cmark_strbuf_free(&parser->linebuf); + source_map_free(parser->source_map); cmark_reference_map_free(parser->refmap); mem->free(parser); } @@ -257,18 +266,28 @@ static cmark_node *finalize(cmark_parser *parser, cmark_node *b) { switch (S_type(b)) { case CMARK_NODE_PARAGRAPH: + source_map_start_cursor(parser->source_map, parser->last_paragraph_extent); while (cmark_strbuf_at(node_content, 0) == '[' && (pos = cmark_parse_reference_inline(parser->mem, node_content, - parser->refmap, parser->root))) { - + parser->refmap, b, + parser->source_map))) { + source_map_start_cursor(parser->source_map, + source_map_get_cursor(parser->source_map)); cmark_strbuf_drop(node_content, pos); } + + while (parser->last_paragraph_extent != source_map_get_cursor(parser->source_map)) { + if (parser->last_paragraph_extent->node == b) { + parser->last_paragraph_extent->node = parser->root; + } + parser->last_paragraph_extent = parser->last_paragraph_extent->next; + } + if (is_blank(node_content, 0)) { // remove blank node (former reference def) cmark_node_free(b); } break; - case CMARK_NODE_CODE_BLOCK: if (!b->as.code.fenced) { // indented code remove_trailing_blank_lines(node_content); @@ -363,21 +382,36 @@ static cmark_node 
*add_child(cmark_parser *parser, cmark_node *parent, // Walk through node and all children, recursively, parsing // string content into inline content where appropriate. -static void process_inlines(cmark_mem *mem, cmark_node *root, - cmark_reference_map *refmap, int options) { - cmark_iter *iter = cmark_iter_new(root); +static void process_inlines(cmark_parser *parser) { + cmark_iter *iter = cmark_iter_new(parser->root); cmark_node *cur; cmark_event_type ev_type; + cmark_source_extent *cur_extent = NULL; + + cur_extent = source_map_get_head(parser->source_map); while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) { cur = cmark_iter_get_node(iter); if (ev_type == CMARK_EVENT_ENTER) { if (contains_inlines(S_type(cur))) { - cmark_parse_inlines(mem, cur, refmap, options); + while (cur_extent && cur_extent->node != cur) { + cur_extent = source_map_stitch_extent(parser->source_map, cur_extent, parser->root, cur, parser->line_offset); + } + + if (parser->source_map) + assert(cur_extent); + + source_map_start_cursor(parser->source_map, cur_extent); + + cmark_parse_inlines(parser->mem, cur, parser->refmap, parser->options, parser->source_map, parser->line_offset); } } } + while (cur_extent) { + cur_extent = source_map_stitch_extent(parser->source_map, cur_extent, parser->root, NULL, parser->line_offset); + } + cmark_iter_free(iter); } @@ -484,7 +518,10 @@ static cmark_node *finalize_document(cmark_parser *parser) { } finalize(parser, parser->root); - process_inlines(parser->mem, parser->root, parser->refmap, parser->options); + + process_inlines(parser); + + assert(source_map_check(parser->source_map, parser->line_offset)); return parser->root; } @@ -526,6 +563,7 @@ void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len) { static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, size_t len, bool eof) { const unsigned char *end = buffer + len; + const unsigned char *skipped; static const uint8_t repl[] = {239, 191, 189}; if 
(parser->error_code) { @@ -550,6 +588,7 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, while (buffer < end) { const unsigned char *eol; bufsize_t chunk_len; + bufsize_t linebuf_size = 0; bool process = false; for (eol = buffer; eol < end; ++eol) { if (S_is_line_end_char(*eol)) { @@ -567,6 +606,7 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, chunk_len = (eol - buffer); if (process) { if (parser->linebuf.size > 0) { + linebuf_size = cmark_strbuf_len(&parser->linebuf); cmark_strbuf_put(&parser->linebuf, buffer, chunk_len); S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size); cmark_strbuf_clear(&parser->linebuf); @@ -585,6 +625,8 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, } buffer += chunk_len; + skipped = buffer; + if (buffer < end) { if (*buffer == '\0') { // skip over NULL @@ -600,6 +642,11 @@ static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer, buffer++; } } + chunk_len += buffer - skipped; + chunk_len += linebuf_size; + + if (process) + parser->line_offset += chunk_len; } } @@ -659,11 +706,13 @@ static void S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input) { // indicates a number of columns; otherwise, a number of bytes. // If advancing a certain number of columns partially consumes // a tab character, parser->partially_consumed_tab is set to true. 
-static void S_advance_offset(cmark_parser *parser, cmark_chunk *input, - bufsize_t count, bool columns) { +static void S_advance_offset(cmark_parser *parser, cmark_node *container, cmark_extent_type type, + cmark_chunk *input, bufsize_t count, bool columns) { char c; int chars_to_tab; int chars_to_advance; + int initial_pos = parser->offset + parser->line_offset; + while (count > 0 && (c = peek_at(input, parser->offset))) { if (c == '\t') { chars_to_tab = TAB_STOP - (parser->column % TAB_STOP); @@ -686,6 +735,8 @@ static void S_advance_offset(cmark_parser *parser, cmark_chunk *input, count -= 1; } } + + source_map_append_extent(parser->source_map, initial_pos, parser->offset + parser->line_offset, container, type); } static bool S_last_child_is_open(cmark_node *container) { @@ -693,7 +744,7 @@ static bool S_last_child_is_open(cmark_node *container) { (container->last_child->flags & CMARK_NODE__OPEN); } -static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) { +static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input, cmark_node *container) { bool res = false; bufsize_t matched = 0; @@ -701,10 +752,10 @@ static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) { parser->indent <= 3 && peek_at(input, parser->first_nonspace) == '>'; if (matched) { - S_advance_offset(parser, input, parser->indent + 1, true); + S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, parser->indent + 1, true); if (S_is_space_or_tab(peek_at(input, parser->offset))) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, 1, true); } res = true; @@ -718,7 +769,7 @@ static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input, if (parser->indent >= container->as.list.marker_offset + container->as.list.padding) { - S_advance_offset(parser, input, container->as.list.marker_offset + + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, 
container->as.list.marker_offset + container->as.list.padding, true); res = true; @@ -726,7 +777,7 @@ static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input, // if container->first_child is NULL, then the opening line // of the list item was blank after the list marker; in this // case, we are done with the list item. - S_advance_offset(parser, input, parser->first_nonspace - parser->offset, + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace - parser->offset, false); res = true; } @@ -740,10 +791,10 @@ static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input, if (!container->as.code.fenced) { // indented if (parser->indent >= CODE_INDENT) { - S_advance_offset(parser, input, CODE_INDENT, true); + S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, CODE_INDENT, true); res = true; } else if (parser->blank) { - S_advance_offset(parser, input, parser->first_nonspace - parser->offset, + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace - parser->offset, false); res = true; } @@ -759,14 +810,15 @@ static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input, // closing fence - and since we're at // the end of a line, we can stop processing it: *should_continue = false; - S_advance_offset(parser, input, matched, false); + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace, false); + S_advance_offset(parser, container, CMARK_EXTENT_CLOSER, input, parser->offset + matched, false); parser->current = finalize(parser, container); } else { // skip opt. 
spaces of fence parser->offset int i = container->as.code.fence_offset; while (i > 0 && S_is_space_or_tab(peek_at(input, parser->offset))) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, 1, true); i--; } res = true; @@ -823,7 +875,7 @@ static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input, switch (cont_type) { case CMARK_NODE_BLOCK_QUOTE: - if (!parse_block_quote_prefix(parser, input)) + if (!parse_block_quote_prefix(parser, input, container)) goto done; break; case CMARK_NODE_ITEM: @@ -883,29 +935,26 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, indented = parser->indent >= CODE_INDENT; if (!indented && peek_at(input, parser->first_nonspace) == '>') { + *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE, + parser->first_nonspace + 1); - bufsize_t blockquote_startpos = parser->first_nonspace; - - S_advance_offset(parser, input, + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, parser->first_nonspace + 1 - parser->offset, false); // optional following character if (S_is_space_or_tab(peek_at(input, parser->offset))) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, *container, CMARK_EXTENT_BLANK, input, 1, true); } - *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE, - blockquote_startpos + 1); } else if (!indented && (matched = scan_atx_heading_start( input, parser->first_nonspace))) { bufsize_t hashpos; int level = 0; - bufsize_t heading_startpos = parser->first_nonspace; - S_advance_offset(parser, input, + *container = add_child(parser, *container, CMARK_NODE_HEADING, + parser->first_nonspace + 1); + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, parser->first_nonspace + matched - parser->offset, false); - *container = add_child(parser, *container, CMARK_NODE_HEADING, - heading_startpos + 1); hashpos = cmark_chunk_strchr(input, '#', parser->first_nonspace); @@ 
-927,7 +976,7 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, (*container)->as.code.fence_offset = (int8_t)(parser->first_nonspace - parser->offset); (*container)->as.code.info = cmark_chunk_literal(""); - S_advance_offset(parser, input, + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, parser->first_nonspace + matched - parser->offset, false); @@ -947,14 +996,14 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, (*container)->type = (uint16_t)CMARK_NODE_HEADING; (*container)->as.heading.level = lev; (*container)->as.heading.setext = true; - S_advance_offset(parser, input, input->len - 1 - parser->offset, false); + S_advance_offset(parser, *container, CMARK_EXTENT_CLOSER, input, input->len - 1 - parser->offset, false); } else if (!indented && !(cont_type == CMARK_NODE_PARAGRAPH && !all_matched) && (matched = scan_thematic_break(input, parser->first_nonspace))) { // it's only now that we know the line is not part of a setext heading: *container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK, parser->first_nonspace + 1); - S_advance_offset(parser, input, input->len - 1 - parser->offset, false); + S_advance_offset(parser, *container, CMARK_EXTENT_CONTENT, input, input->len - 1 - parser->offset, false); } else if ((!indented || cont_type == CMARK_NODE_LIST) && (matched = parse_list_marker( parser->mem, input, parser->first_nonspace, @@ -964,6 +1013,7 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, // spaces indent, as long as the list container is still open. 
cmark_node *list = NULL; cmark_node *item = NULL; + cmark_source_extent *save_source_map_tail; int i = 0; if (cont_type != CMARK_NODE_LIST || @@ -980,17 +1030,18 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, item = *container; // compute padding: - S_advance_offset(parser, input, + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, parser->first_nonspace + matched - parser->offset, false); save_partially_consumed_tab = parser->partially_consumed_tab; save_offset = parser->offset; save_column = parser->column; + save_source_map_tail = source_map_get_tail(parser->source_map); while (parser->column - save_column <= 5 && S_is_space_or_tab(peek_at(input, parser->offset))) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, *container, CMARK_EXTENT_BLANK, input, 1, true); } i = parser->column - save_column; @@ -1000,9 +1051,14 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, data->padding = matched + 1; parser->offset = save_offset; parser->column = save_column; + if (save_source_map_tail) { + cmark_source_extent *tmp_extent; + for (tmp_extent = save_source_map_tail->next; tmp_extent; tmp_extent = source_map_free_extent(parser->source_map, tmp_extent)); + } + parser->partially_consumed_tab = save_partially_consumed_tab; if (i > 0) { - S_advance_offset(parser, input, 1, true); + S_advance_offset(parser, *container, CMARK_EXTENT_BLANK, input, 1, true); } } else { data->padding = matched + i; @@ -1021,7 +1077,6 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, parser->mem->free(data); } else if (indented && !maybe_lazy && !parser->blank) { - S_advance_offset(parser, input, CODE_INDENT, true); *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK, parser->offset + 1); (*container)->as.code.fenced = false; @@ -1030,6 +1085,7 @@ static void open_new_blocks(cmark_parser *parser, cmark_node **container, (*container)->as.code.fence_offset = 0; 
(*container)->as.code.info = cmark_chunk_literal(""); + S_advance_offset(parser, *container, CMARK_EXTENT_OPENER, input, CODE_INDENT, true); } else { break; } @@ -1094,6 +1150,11 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container, } if (S_type(container) == CMARK_NODE_CODE_BLOCK) { + source_map_append_extent(parser->source_map, + parser->offset + parser->line_offset, + parser->line_offset + input->len, + container, + CMARK_EXTENT_CONTENT); add_line(container, input, parser); } else if (S_type(container) == CMARK_NODE_HTML_BLOCK) { add_line(container, input, parser); @@ -1130,26 +1191,48 @@ static void add_text_to_container(cmark_parser *parser, cmark_node *container, break; } + source_map_append_extent(parser->source_map, + parser->offset + parser->line_offset, + parser->line_offset + input->len, + container, + CMARK_EXTENT_CONTENT); + if (matches_end_condition) { container = finalize(parser, container); assert(parser->current != NULL); } } else if (parser->blank) { - // ??? 
do nothing + source_map_append_extent(parser->source_map, + parser->line_offset + parser->offset, + parser->line_offset + input->len, + container, + CMARK_EXTENT_BLANK); } else if (accepts_lines(S_type(container))) { + bufsize_t initial_len = input->len; + bool chopped = false; + if (S_type(container) == CMARK_NODE_HEADING && container->as.heading.setext == false) { chop_trailing_hashtags(input); + chopped = true; } - S_advance_offset(parser, input, parser->first_nonspace - parser->offset, + S_advance_offset(parser, container, CMARK_EXTENT_BLANK, input, parser->first_nonspace - parser->offset, false); add_line(container, input, parser); + + if (chopped) + source_map_append_extent(parser->source_map, + MAX(parser->line_offset + parser->offset, parser->line_offset + input->len), + parser->line_offset + initial_len, + container, + CMARK_EXTENT_CLOSER); } else { // create paragraph container for line container = add_child(parser, container, CMARK_NODE_PARAGRAPH, parser->first_nonspace + 1); - S_advance_offset(parser, input, parser->first_nonspace - parser->offset, + S_advance_offset(parser, container, CMARK_EXTENT_OPENER, input, parser->first_nonspace - parser->offset, false); + parser->last_paragraph_extent = source_map_get_tail(parser->source_map); add_line(container, input, parser); } @@ -1211,6 +1294,7 @@ static void S_process_line(cmark_parser *parser, const unsigned char *buffer, cmark_node *cmark_parser_finish(cmark_parser *parser) { if (parser->linebuf.size) { S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size); + parser->line_offset += parser->linebuf.size; cmark_strbuf_clear(&parser->linebuf); } @@ -1235,6 +1319,12 @@ cmark_node *cmark_parser_finish(cmark_parser *parser) { return parser->root; } +cmark_source_extent * +cmark_parser_get_first_source_extent(cmark_parser *parser) +{ + return source_map_get_head(parser->source_map); +} + cmark_err_type cmark_parser_get_error(cmark_parser *parser) { return parser->error_code; } diff --git 
a/src/cmark.h b/src/cmark.h index 389bc3814..cc1e7f527 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -66,6 +66,21 @@ typedef enum { CMARK_NODE_LAST_INLINE = CMARK_NODE_IMAGE, } cmark_node_type; +typedef enum { + CMARK_EXTENT_NONE, + CMARK_EXTENT_OPENER, + CMARK_EXTENT_CLOSER, + CMARK_EXTENT_BLANK, + CMARK_EXTENT_CONTENT, + CMARK_EXTENT_PUNCTUATION, + CMARK_EXTENT_LINK_DESTINATION, + CMARK_EXTENT_LINK_TITLE, + CMARK_EXTENT_LINK_LABEL, + CMARK_EXTENT_REFERENCE_DESTINATION, + CMARK_EXTENT_REFERENCE_LABEL, + CMARK_EXTENT_REFERENCE_TITLE, +} cmark_extent_type; + /* For backwards compatibility: */ #define CMARK_NODE_HEADER CMARK_NODE_HEADING #define CMARK_NODE_HRULE CMARK_NODE_THEMATIC_BREAK @@ -93,6 +108,7 @@ typedef enum { typedef struct cmark_node cmark_node; typedef struct cmark_parser cmark_parser; typedef struct cmark_iter cmark_iter; +typedef struct cmark_source_extent cmark_source_extent; /** * ## Custom memory allocator support @@ -504,6 +520,11 @@ void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len); CMARK_EXPORT cmark_node *cmark_parser_finish(cmark_parser *parser); +/** Return a pointer to the first extent of the parser's source map + */ +CMARK_EXPORT +cmark_source_extent *cmark_parser_get_first_source_extent(cmark_parser *parser); + /** Parse a CommonMark document in 'buffer' of length 'len'. * Returns a pointer to a tree of nodes. The memory allocated for * the node tree should be released using 'cmark_node_free' @@ -515,10 +536,44 @@ cmark_node *cmark_parse_document(const char *buffer, size_t len, int options); /** Parse a CommonMark document in file 'f', returning a pointer to * a tree of nodes. The memory allocated for the node tree should be * released using 'cmark_node_free' when it is no longer needed. + * Returns NULL on error. 
*/ CMARK_EXPORT cmark_node *cmark_parse_file(FILE *f, int options); +/** + * ## Source map API + */ + +/* Return the index, in bytes, of the start of this extent */ +CMARK_EXPORT +size_t cmark_source_extent_get_start(cmark_source_extent *extent); + +/* Return the index, in bytes, of the stop of this extent. This + * index is not included in the extent*/ +CMARK_EXPORT +size_t cmark_source_extent_get_stop(cmark_source_extent *extent); + +/* Return the extent immediately following 'extent' */ +CMARK_EXPORT +cmark_source_extent *cmark_source_extent_get_next(cmark_source_extent *extent); + +/* Return the extent immediately preceding 'extent' */ +CMARK_EXPORT +cmark_source_extent *cmark_source_extent_get_previous(cmark_source_extent *extent); + +/* Return the node 'extent' maps to */ +CMARK_EXPORT +cmark_node *cmark_source_extent_get_node(cmark_source_extent *extent); + +/* Return the type of 'extent' */ +CMARK_EXPORT +cmark_extent_type cmark_source_extent_get_type(cmark_source_extent *extent); + +/* Return a string representation of 'extent' */ +CMARK_EXPORT +const char *cmark_source_extent_get_type_string(cmark_source_extent *extent); + /** * ## Rendering */ diff --git a/src/inlines.c b/src/inlines.c index fccdd9104..d2378b53f 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -13,6 +13,10 @@ #include "scanners.h" #include "inlines.h" +#ifndef MIN +#define MIN(x, y) ((x < y) ? 
x : y) +#endif + static const char *EMDASH = "\xE2\x80\x94"; static const char *ENDASH = "\xE2\x80\x93"; static const char *ELLIPSES = "\xE2\x80\xA6"; @@ -40,6 +44,7 @@ typedef struct delimiter { unsigned char delim_char; bool can_open; bool can_close; + cmark_source_extent *extent; } delimiter; typedef struct bracket { @@ -50,6 +55,7 @@ typedef struct bracket { bool image; bool active; bool bracket_after; + cmark_source_extent *extent; } bracket; typedef struct { @@ -61,6 +67,7 @@ typedef struct { bracket *last_bracket; bufsize_t backticks[MAXBACKTICKS + 1]; bool scanned_for_backticks; + cmark_source_map *source_map; } subject; static CMARK_INLINE bool S_is_line_end_char(char c) { @@ -73,7 +80,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, static int parse_inline(subject *subj, cmark_node *parent, int options); static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, - cmark_reference_map *refmap); + cmark_reference_map *refmap, cmark_source_map *source_map); static bufsize_t subject_find_special_char(subject *subj, int options); // Create an inline with a literal string value. 
@@ -149,7 +156,7 @@ static CMARK_INLINE cmark_node *make_autolink(cmark_mem *mem, cmark_chunk url, } static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, - cmark_reference_map *refmap) { + cmark_reference_map *refmap, cmark_source_map *source_map) { int i; e->mem = mem; e->input.data = buffer->ptr; @@ -159,6 +166,7 @@ static void subject_from_buf(cmark_mem *mem, subject *e, cmark_strbuf *buffer, e->refmap = refmap; e->last_delim = NULL; e->last_bracket = NULL; + e->source_map = source_map; for (i = 0; i <= MAXBACKTICKS; i++) { e->backticks[i] = 0; } @@ -406,6 +414,7 @@ static void push_delimiter(subject *subj, unsigned char c, bool can_open, if (delim->previous != NULL) { delim->previous->next = delim; } + delim->extent = NULL; subj->last_delim = delim; } @@ -421,11 +430,12 @@ static void push_bracket(subject *subj, bool image, cmark_node *inl_text) { b->previous_delimiter = subj->last_delim; b->position = subj->pos; b->bracket_after = false; + b->extent = NULL; subj->last_bracket = b; } // Assumes the subject has a c at the current position. 
-static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { +static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart, bool *pushed) { bufsize_t numdelims; cmark_node *inl_text; bool can_open, can_close; @@ -446,6 +456,9 @@ static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) { if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) { push_delimiter(subj, c, can_open, can_close, inl_text); + *pushed = true; + } else { + *pushed = false; } return inl_text; @@ -612,6 +625,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, bufsize_t opener_num_chars = opener_inl->as.literal.len; bufsize_t closer_num_chars = closer_inl->as.literal.len; cmark_node *tmp, *tmpnext, *emph; + cmark_source_extent *tmp_extent = NULL; // calculate the actual number of characters used from this closer if (closer_num_chars < 3 || opener_num_chars < 3) { @@ -647,9 +661,30 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, } cmark_node_insert_after(opener_inl, emph); + if (subj->source_map) { + tmp_extent = closer->extent->prev; + + source_map_insert_extent(subj->source_map, + opener->extent, + opener->extent->stop - use_delims, + opener->extent->stop, + emph, + CMARK_EXTENT_OPENER); + opener->extent->stop -= use_delims; + + source_map_insert_extent(subj->source_map, + tmp_extent, + closer->extent->start, + closer->extent->start + use_delims, + emph, + CMARK_EXTENT_CLOSER); + closer->extent->start += use_delims; + } + // if opener has 0 characters, remove it and its associated inline if (opener_num_chars == 0) { cmark_node_free(opener_inl); + source_map_free_extent(subj->source_map, opener->extent); remove_delimiter(subj, opener); } @@ -659,6 +694,7 @@ static delimiter *S_insert_emph(subject *subj, delimiter *opener, cmark_node_free(closer_inl); // remove closer from list tmp_delim = closer->next; + source_map_free_extent(subj->source_map, closer->extent); remove_delimiter(subj, closer); 
closer = tmp_delim; } @@ -883,6 +919,8 @@ static cmark_node *handle_close_bracket(subject *subj) { int found_label; cmark_node *tmp, *tmpnext; bool is_image; + bool is_inline = false; + bool is_shortcut = false; advance(subj); // advance past ] initial_pos = subj->pos; @@ -933,6 +971,7 @@ static cmark_node *handle_close_bracket(subject *subj) { title = cmark_clean_title(subj->mem, &title_chunk); cmark_chunk_free(subj->mem, &url_chunk); cmark_chunk_free(subj->mem, &title_chunk); + is_inline = true; goto match; } else { @@ -955,6 +994,7 @@ static cmark_node *handle_close_bracket(subject *subj) { cmark_chunk_free(subj->mem, &raw_label); raw_label = cmark_chunk_dup(&subj->input, opener->position, initial_pos - opener->position - 1); + is_shortcut = true; found_label = true; } @@ -984,6 +1024,31 @@ static cmark_node *handle_close_bracket(subject *subj) { cmark_node_insert_before(opener->inl_text, inl); // Add link text: tmp = opener->inl_text->next; + + if (subj->source_map) { + assert(opener->extent); + + opener->extent->node = inl; + opener->extent->type = CMARK_EXTENT_OPENER; + } + + source_map_splice_extent(subj->source_map, initial_pos - 1, initial_pos, inl, CMARK_EXTENT_PUNCTUATION); + if (is_inline) { + source_map_splice_extent(subj->source_map, after_link_text_pos, starturl, inl, CMARK_EXTENT_PUNCTUATION); + source_map_splice_extent(subj->source_map, starturl, endurl, inl, CMARK_EXTENT_LINK_DESTINATION); + if (endtitle != starttitle) { + source_map_splice_extent(subj->source_map, endurl, starttitle, inl, CMARK_EXTENT_BLANK); + source_map_splice_extent(subj->source_map, starttitle, endtitle, inl, CMARK_EXTENT_LINK_TITLE); + source_map_splice_extent(subj->source_map, endtitle, subj->pos, inl, CMARK_EXTENT_PUNCTUATION); + } else { + source_map_splice_extent(subj->source_map, endurl, subj->pos, inl, CMARK_EXTENT_PUNCTUATION); + } + } else if (!is_shortcut) { + source_map_splice_extent(subj->source_map, initial_pos, initial_pos + 1, inl, CMARK_EXTENT_PUNCTUATION); + 
source_map_splice_extent(subj->source_map, initial_pos + 1, subj->pos - 1, inl, CMARK_EXTENT_LINK_LABEL); + source_map_splice_extent(subj->source_map, subj->pos - 1, subj->pos, inl, CMARK_EXTENT_PUNCTUATION); + } + while (tmp) { tmpnext = tmp->next; cmark_node_append_child(inl, tmp); @@ -1087,6 +1152,11 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { cmark_chunk contents; unsigned char c; bufsize_t endpos; + bufsize_t startpos = subj->pos; + bufsize_t trimmed_spaces = 0; + bool add_extent_to_last_bracket = false; + bool add_extent_to_last_delimiter = false; + c = peek_char(subj); if (c == 0) { return 0; @@ -1095,6 +1165,8 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { case '\r': case '\n': new_inl = handle_newline(subj); + if (new_inl->type == CMARK_NODE_LINEBREAK) + startpos -= 2; break; case '`': new_inl = handle_backticks(subj); @@ -1112,7 +1184,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { case '_': case '\'': case '"': - new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0); + new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0, &add_extent_to_last_delimiter); break; case '-': new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0); @@ -1124,6 +1196,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { advance(subj); new_inl = make_str(subj->mem, cmark_chunk_literal("[")); push_bracket(subj, false, new_inl); + add_extent_to_last_bracket = true; break; case ']': new_inl = handle_close_bracket(subj); @@ -1134,6 +1207,7 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { advance(subj); new_inl = make_str(subj->mem, cmark_chunk_literal("![")); push_bracket(subj, true, new_inl); + add_extent_to_last_bracket = true; } else { new_inl = make_str(subj->mem, cmark_chunk_literal("!")); } @@ -1145,12 +1219,24 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { // if we're at 
a newline, strip trailing spaces. if (S_is_line_end_char(peek_char(subj))) { + bufsize_t initial_size = contents.len; cmark_chunk_rtrim(&contents); + trimmed_spaces = initial_size - contents.len; } new_inl = make_str(subj->mem, contents); } + if (new_inl != NULL) { + cmark_source_extent *extent; + + extent = source_map_splice_extent(subj->source_map, startpos, subj->pos - trimmed_spaces, new_inl, CMARK_EXTENT_CONTENT); + + if (add_extent_to_last_bracket) + subj->last_bracket->extent = extent; + else if (add_extent_to_last_delimiter) + subj->last_delim->extent = extent; + cmark_node_append_child(parent, new_inl); } @@ -1159,9 +1245,11 @@ static int parse_inline(subject *subj, cmark_node *parent, int options) { // Parse inlines from parent's string_content, adding as children of parent. extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, - cmark_reference_map *refmap, int options) { + cmark_reference_map *refmap, int options, + cmark_source_map *source_map, bufsize_t total_length) { subject subj; - subject_from_buf(mem, &subj, &parent->content, refmap); + subject_from_buf(mem, &subj, &parent->content, refmap, source_map); + bufsize_t initial_len = subj.input.len; cmark_chunk_rtrim(&subj.input); while (!is_eof(&subj) && parse_inline(&subj, parent, options)) @@ -1175,6 +1263,14 @@ extern void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, while (subj.last_bracket) { pop_bracket(&subj); } + + if (source_map) + source_map_insert_extent(source_map, + source_map->cursor, + source_map->cursor->stop, + MIN(source_map->cursor->stop + initial_len - subj.input.len, total_length), + parent, + CMARK_EXTENT_BLANK); } // Parse zero or more space characters, including at most one newline. @@ -1191,24 +1287,29 @@ static void spnl(subject *subj) { // after reference is parsed. 
bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, cmark_reference_map *refmap, - cmark_node *root) { + cmark_node *container, + cmark_source_map *source_map) { subject subj; + cmark_node *reference = cmark_node_new(CMARK_NODE_REFERENCE); + cmark_reference *ref; cmark_chunk lab; cmark_chunk url; cmark_chunk title; bufsize_t matchlen = 0; - bufsize_t beforetitle; - cmark_reference *ref; - cmark_node *reference = cmark_node_new(CMARK_NODE_REFERENCE); + bufsize_t starttitle, endtitle; + bufsize_t endlabel; + bufsize_t starturl, endurl; - subject_from_buf(mem, &subj, input, NULL); + subject_from_buf(mem, &subj, input, NULL, source_map); // parse label: if (!link_label(&subj, &lab) || lab.len == 0) goto nomatch; + endlabel = subj.pos - 1; + // colon: if (peek_char(&subj) == ':') { advance(&subj); @@ -1218,6 +1319,7 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, // parse link url: spnl(&subj); + starturl = subj.pos; matchlen = manual_scan_link_url(&subj.input, subj.pos); if (matchlen > 0) { url = cmark_chunk_dup(&subj.input, subj.pos, matchlen); @@ -1227,22 +1329,29 @@ bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, } // parse optional link_title - beforetitle = subj.pos; + endurl = subj.pos; spnl(&subj); + starttitle = subj.pos; matchlen = scan_link_title(&subj.input, subj.pos); if (matchlen) { title = cmark_chunk_dup(&subj.input, subj.pos, matchlen); subj.pos += matchlen; } else { - subj.pos = beforetitle; + subj.pos = endurl; + starttitle = endurl; + endtitle = endurl; title = cmark_chunk_literal(""); } + endtitle = subj.pos; + // parse final spaces and newline: skip_spaces(&subj); if (!skip_line_end(&subj)) { if (matchlen) { // try rewinding before title - subj.pos = beforetitle; + subj.pos = endurl; + starttitle = endurl; + endtitle = endurl; skip_spaces(&subj); title = cmark_chunk_literal(""); if (!skip_line_end(&subj)) { @@ -1259,11 +1368,21 @@ bufsize_t 
cmark_parse_reference_inline(cmark_mem *mem, cmark_strbuf *input, cmark_chunk_set_cstr(mem, &reference->as.reference.label, (char *) ref->label); cmark_chunk_set_cstr(mem, &reference->as.reference.url, cmark_chunk_to_cstr(mem, &ref->url)); cmark_chunk_set_cstr(mem, &reference->as.reference.title, cmark_chunk_to_cstr(mem, &ref->title)); - cmark_node_append_child(root, reference); + cmark_node_insert_before(container, reference); cmark_reference_add(refmap, ref); } + // Mark the extents of the reference + source_map_splice_extent(source_map, 0, 1, reference, CMARK_EXTENT_OPENER); + source_map_splice_extent(source_map, 1, endlabel, reference, CMARK_EXTENT_REFERENCE_LABEL); + source_map_splice_extent(source_map, endlabel, endlabel + 2, reference, CMARK_EXTENT_PUNCTUATION); + source_map_splice_extent(source_map, endlabel + 2, starturl, reference, CMARK_EXTENT_BLANK); + source_map_splice_extent(source_map, starturl, endurl, reference, CMARK_EXTENT_REFERENCE_DESTINATION); + source_map_splice_extent(source_map, endurl, starttitle, reference, CMARK_EXTENT_BLANK); + source_map_splice_extent(source_map, starttitle, endtitle, reference, CMARK_EXTENT_REFERENCE_TITLE); + source_map_splice_extent(source_map, endtitle, subj.pos, reference, CMARK_EXTENT_BLANK); + return subj.pos; nomatch: diff --git a/src/inlines.h b/src/inlines.h index a09a75940..ee85b87de 100644 --- a/src/inlines.h +++ b/src/inlines.h @@ -1,6 +1,10 @@ #ifndef CMARK_INLINES_H #define CMARK_INLINES_H +#include "chunk.h" +#include "references.h" +#include "source_map.h" + #ifdef __cplusplus extern "C" { #endif @@ -9,11 +13,13 @@ cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url); cmark_chunk cmark_clean_title(cmark_mem *mem, cmark_chunk *title); void cmark_parse_inlines(cmark_mem *mem, cmark_node *parent, - cmark_reference_map *refmap, int options); + cmark_reference_map *refmap, int options, + cmark_source_map *source_map, bufsize_t total_length); bufsize_t cmark_parse_reference_inline(cmark_mem *mem, 
cmark_strbuf *input, cmark_reference_map *refmap, - cmark_node *root); + cmark_node *container, + cmark_source_map *source_map); #ifdef __cplusplus } diff --git a/src/parser.h b/src/parser.h index ec8c9b889..7b4fdbc9b 100644 --- a/src/parser.h +++ b/src/parser.h @@ -6,6 +6,7 @@ #include "node.h" #include "buffer.h" #include "memory.h" +#include "source_map.h" #ifdef __cplusplus extern "C" { @@ -30,9 +31,12 @@ struct cmark_parser { bool partially_consumed_tab; cmark_strbuf curline; bufsize_t last_line_length; + bufsize_t line_offset; cmark_strbuf linebuf; int options; bool last_buffer_ended_with_cr; + cmark_source_map *source_map; + cmark_source_extent *last_paragraph_extent; }; #ifdef __cplusplus diff --git a/src/source_map.c b/src/source_map.c new file mode 100644 index 000000000..754c5bb6c --- /dev/null +++ b/src/source_map.c @@ -0,0 +1,344 @@ +#include + +#include "source_map.h" + +cmark_source_map * +source_map_new(cmark_mem *mem) +{ + cmark_source_map *res = (cmark_source_map *) mem->calloc(1, sizeof(cmark_source_map)); + res->mem = mem; + return res; +} + +void +source_map_free(cmark_source_map *self) +{ + if (!self) + return; + + cmark_source_extent *tmp; + for (tmp = self->head; tmp; tmp = source_map_free_extent(self, tmp)); + self->mem->free(self); +} + +cmark_source_extent * +source_map_append_extent(cmark_source_map *self, bufsize_t start, bufsize_t stop, cmark_node *node, cmark_extent_type type) +{ + if (!self) + return NULL; + + assert (start <= stop); + assert (!self->tail || self->tail->stop <= start); + + cmark_source_extent *res = (cmark_source_extent *) self->mem->calloc(1, sizeof(cmark_source_extent)); + + res->start = start; + res->stop = stop; + res->node = node; + res->type = type; + + res->next = NULL; + res->prev = self->tail; + + if (!self->head) + self->head = res; + else + self->tail->next = res; + + self->tail = res; + + return res; +} + +cmark_source_extent * +source_map_insert_extent(cmark_source_map *self, cmark_source_extent 
*previous, + bufsize_t start, bufsize_t stop, cmark_node *node, cmark_extent_type type) +{ + if (!self) + return NULL; + + if (start == stop) + return previous; + + cmark_source_extent *extent = (cmark_source_extent *) self->mem->calloc(1, sizeof(cmark_source_extent)); + + extent->start = start; + extent->stop = stop; + extent->node = node; + extent->type = type; + extent->next = previous->next; + extent->prev = previous; + previous->next = extent; + + if (extent->next) + extent->next->prev = extent; + else + self->tail = extent; + + return extent; +} + +cmark_source_extent * +source_map_free_extent(cmark_source_map *self, cmark_source_extent *extent) +{ + if (!self) + return NULL; + + cmark_source_extent *next = extent->next; + + if (extent->prev) + extent->prev->next = next; + + if (extent->next) + extent->next->prev = extent->prev; + + if (extent == self->tail) + self->tail = extent->prev; + + if (extent == self->head) + self->head = extent->next; + + if (extent == self->cursor) { + self->cursor = extent->prev; + } + + if (extent == self->next_cursor) { + self->next_cursor = extent->next; + } + + self->mem->free(extent); + + return next; +} + +cmark_source_extent * +source_map_stitch_extent(cmark_source_map *self, cmark_source_extent *extent, + cmark_node *root, cmark_node *target_node, bufsize_t total_length) +{ + cmark_source_extent *res; + + if (!self) + return NULL; + + while (extent->next && extent->start == extent->stop) { + extent = source_map_free_extent(self, extent); + if (extent->node == target_node) + return extent; + } + + if (extent->next) { + res = source_map_insert_extent(self, + extent, + extent->stop, + extent->next->start, + root, + CMARK_EXTENT_BLANK)->next; + } else { + res = source_map_insert_extent(self, + extent, + extent->stop, + total_length, + root, + CMARK_EXTENT_BLANK)->next; + } + + if (extent->start == extent->stop) + source_map_free_extent(self, extent); + + return res; +} + +cmark_source_extent * 
+source_map_splice_extent(cmark_source_map *self, bufsize_t start, bufsize_t stop, + cmark_node *node, cmark_extent_type type) +{ + if (!self) + return NULL; + + if (!self->next_cursor) { + self->cursor = source_map_insert_extent(self, + self->cursor, + start + self->cursor_offset, + stop + self->cursor_offset, node, type); + + return self->cursor; + } else if (start + self->cursor_offset < self->next_cursor->start && + stop + self->cursor_offset <= self->next_cursor->start) { + self->cursor = source_map_insert_extent(self, + self->cursor, + start + self->cursor_offset, + stop + self->cursor_offset, node, type); + + return self->cursor; + } else if (start + self->cursor_offset < self->next_cursor->start) { + bufsize_t new_start = self->next_cursor->start - self->cursor_offset; + + self->cursor = source_map_insert_extent(self, + self->cursor, + start + self->cursor_offset, + self->next_cursor->start, + node, type); + + if (new_start == stop) + return self->cursor; + + start = new_start; + } + + while (self->next_cursor && start + self->cursor_offset >= self->next_cursor->start) { + self->cursor_offset += self->next_cursor->stop - self->next_cursor->start; + self->cursor = self->cursor->next; + self->next_cursor = self->cursor->next; + } + + return source_map_splice_extent(self, start, stop, node, type); +} + +bool +source_map_start_cursor(cmark_source_map *self, cmark_source_extent *cursor) +{ + if (!self) + return false; + + self->cursor = cursor ? 
cursor : self->head; + + if (!self->cursor) + return false; + + self->next_cursor = self->cursor->next; + self->cursor_offset = self->cursor->stop; + + return true; +} + +void +source_map_pretty_print(cmark_source_map *self) { + cmark_source_extent *tmp; + + if (!self) + return; + + for (tmp = self->head; tmp; tmp = tmp->next) { + printf ("%d:%d - %s, %s (%p)\n", tmp->start, tmp->stop, + cmark_node_get_type_string(tmp->node), + cmark_source_extent_get_type_string(tmp), + (void *) tmp->node); + } +} + +bool +source_map_check(cmark_source_map *self, bufsize_t total_length) +{ + bufsize_t last_stop = 0; + cmark_source_extent *tmp; + + if (!self) + return true; + + for (tmp = self->head; tmp; tmp = tmp->next) { + if (tmp->start != last_stop) { + return false; + } if (tmp->start == tmp->stop) + return false; + last_stop = tmp->stop; + } + + if (last_stop != total_length) + return false; + + return true; +} + +cmark_source_extent * +source_map_get_cursor(cmark_source_map *self) +{ + if (!self) + return NULL; + + return self->cursor; +} + +cmark_source_extent * +source_map_get_head(cmark_source_map *self) { + if (!self) + return NULL; + + return self->head; +} + +cmark_source_extent * +source_map_get_tail(cmark_source_map *self) +{ + if (!self) + return NULL; + + return self->tail; +} + +size_t +cmark_source_extent_get_start(cmark_source_extent *extent) +{ + return extent->start; +} + +size_t +cmark_source_extent_get_stop(cmark_source_extent *extent) +{ + return extent->stop; +} + +cmark_node * +cmark_source_extent_get_node(cmark_source_extent *extent) +{ + return extent->node; +} + +cmark_source_extent * +cmark_source_extent_get_next(cmark_source_extent *extent) +{ + return extent->next; +} + +cmark_source_extent * +cmark_source_extent_get_previous(cmark_source_extent *extent) +{ + return extent->prev; +} + +cmark_extent_type +cmark_source_extent_get_type(cmark_source_extent *extent) +{ + return extent->type; +} + +const char * 
+cmark_source_extent_get_type_string(cmark_source_extent *extent) +{ + switch (extent->type) { + case CMARK_EXTENT_NONE: + return "unknown"; + case CMARK_EXTENT_OPENER: + return "opener"; + case CMARK_EXTENT_CLOSER: + return "closer"; + case CMARK_EXTENT_BLANK: + return "blank"; + case CMARK_EXTENT_CONTENT: + return "content"; + case CMARK_EXTENT_PUNCTUATION: + return "punctuation"; + case CMARK_EXTENT_LINK_DESTINATION: + return "link_destination"; + case CMARK_EXTENT_LINK_TITLE: + return "link_title"; + case CMARK_EXTENT_LINK_LABEL: + return "link_label"; + case CMARK_EXTENT_REFERENCE_DESTINATION: + return "reference_destination"; + case CMARK_EXTENT_REFERENCE_LABEL: + return "reference_label"; + case CMARK_EXTENT_REFERENCE_TITLE: + return "reference_title"; + } + return "unknown"; +} diff --git a/src/source_map.h b/src/source_map.h new file mode 100644 index 000000000..de13f8ed7 --- /dev/null +++ b/src/source_map.h @@ -0,0 +1,74 @@ +#ifndef CMARK_SOURCE_MAP_H +#define CMARK_SOURCE_MAP_H + +#include "cmark.h" +#include "config.h" +#include "buffer.h" + +typedef struct _cmark_source_map +{ + cmark_source_extent *head; + cmark_source_extent *tail; + cmark_source_extent *cursor; + cmark_source_extent *next_cursor; + bufsize_t cursor_offset; + cmark_mem *mem; +} cmark_source_map; + +struct cmark_source_extent +{ + bufsize_t start; + bufsize_t stop; + struct cmark_source_extent *next; + struct cmark_source_extent *prev; + cmark_node *node; + cmark_extent_type type; +}; + +cmark_source_map * source_map_new (cmark_mem *mem); + +void source_map_free (cmark_source_map *self); + +bool source_map_check (cmark_source_map *self, + bufsize_t total_length); + +void source_map_pretty_print (cmark_source_map *self); + +cmark_source_extent * source_map_append_extent(cmark_source_map *self, + bufsize_t start, + bufsize_t stop, + cmark_node *node, + cmark_extent_type type); + +cmark_source_extent * source_map_insert_extent(cmark_source_map *self, + cmark_source_extent *previous, + 
bufsize_t start, + bufsize_t stop, + cmark_node *node, + cmark_extent_type type); + +cmark_source_extent * source_map_free_extent (cmark_source_map *self, + cmark_source_extent *extent); + +cmark_source_extent * source_map_stitch_extent(cmark_source_map *self, + cmark_source_extent *extent, + cmark_node *root, + cmark_node *target_node, + bufsize_t total_length); + +cmark_source_extent * source_map_splice_extent(cmark_source_map *self, + bufsize_t start, + bufsize_t stop, + cmark_node *node, + cmark_extent_type type); + +cmark_source_extent * source_map_get_cursor (cmark_source_map *self); + +cmark_source_extent * source_map_get_head (cmark_source_map *self); + +cmark_source_extent * source_map_get_tail (cmark_source_map *self); + +bool source_map_start_cursor (cmark_source_map *self, + cmark_source_extent *cursor); + +#endif diff --git a/test/cmark.py b/test/cmark.py index f4ff5765b..fd35d54bb 100644 --- a/test/cmark.py +++ b/test/cmark.py @@ -6,6 +6,8 @@ import platform import os +OPT_SOURCEPOS = 1 << 1 + def pipe_through_prog(prog, text): p1 = Popen(prog.split(), stdout=PIPE, stdin=PIPE, stderr=PIPE) [result, err] = p1.communicate(input=text.encode('utf-8')) @@ -29,7 +31,8 @@ def to_commonmark(lib, text): render_commonmark = lib.cmark_render_commonmark render_commonmark.restype = c_char_p render_commonmark.argtypes = [c_void_p, c_int, c_int] - node = parse_document(textbytes, textlen, 0) + # We want tests to go through the source map code + node = parse_document(textbytes, textlen, OPT_SOURCEPOS) if node is None: raise Exception("parse_document failed") result = render_commonmark(node, 0, 0).decode('utf-8') From a44cfaf95bfd12401622786da0f4d7fd9f942386 Mon Sep 17 00:00:00 2001 From: Mathieu Duponchelle Date: Wed, 4 Jan 2017 22:28:15 +0100 Subject: [PATCH 4/5] wrapper.py: more extensive coverage Add cmark_default_mem_free(pointer). API change. 
--- src/cmark.c | 5 + src/cmark.h | 5 + test/CMakeLists.txt | 17 + test/test_cmark.py | 517 ++++++++++++++++++++++++ wrappers/wrapper.py | 963 ++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 1476 insertions(+), 31 deletions(-) create mode 100644 test/test_cmark.py mode change 100755 => 100644 wrappers/wrapper.py diff --git a/src/cmark.c b/src/cmark.c index ebd933312..da93abe21 100644 --- a/src/cmark.c +++ b/src/cmark.c @@ -24,6 +24,11 @@ static void *xrealloc(void *ptr, size_t size) { return new_ptr; } +void cmark_default_mem_free(void *ptr) +{ + free(ptr); +} + cmark_mem DEFAULT_MEM_ALLOCATOR = {xcalloc, xrealloc, free}; char *cmark_markdown_to_html(const char *text, size_t len, int options) { diff --git a/src/cmark.h b/src/cmark.h index cc1e7f527..322ac9601 100644 --- a/src/cmark.h +++ b/src/cmark.h @@ -123,6 +123,11 @@ typedef struct cmark_mem { void (*free)(void *); } cmark_mem; +/** Convenience function for bindings. + */ +CMARK_EXPORT +void cmark_default_mem_free(void *ptr); + /** * ## Creating and Destroying Nodes */ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6da3a6bac..32497fb0a 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -78,3 +78,20 @@ ELSE(PYTHONINTERP_FOUND) ENDIF(PYTHONINTERP_FOUND) +if (PYTHON_BINDING_TESTS) + find_package(PythonInterp 3 REQUIRED) +else(PYTHON_BINDING_TESTS) + find_package(PythonInterp 3) +endif(PYTHON_BINDING_TESTS) + +IF (PYTHONINTERP_FOUND) + add_test(python3_bindings + ${PYTHON_EXECUTABLE} + "${CMAKE_CURRENT_SOURCE_DIR}/test_cmark.py" + "${CMAKE_CURRENT_BINARY_DIR}/../src" + ) +ELSE(PYTHONINTERP_FOUND) + message("\n*** A python 3 interpreter is required to run the python binding tests.\n") + add_test(skipping_python_binding_tests + echo "Skipping python binding tests, because no python 3 interpreter is available.") +ENDIF(PYTHONINTERP_FOUND) diff --git a/test/test_cmark.py b/test/test_cmark.py new file mode 100644 index 000000000..e86e38bab --- /dev/null +++ b/test/test_cmark.py @@ 
-0,0 +1,517 @@ +# -*- coding: utf8 -*- + +from __future__ import unicode_literals + +import sys +import os +import unittest +import argparse + +here = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.join(here, os.pardir, 'wrappers')) +from wrapper import * + +class TestHighLevel(unittest.TestCase): + def test_markdown_to_html(self): + self.assertEqual(markdown_to_html('foo'), '

foo

\n') + + def test_parse_document(self): + doc = parse_document('foo') + self.assertEqual(type(doc), Document) + +class TestParser(unittest.TestCase): + def test_lifecycle(self): + parser = Parser() + del parser + + def test_feed(self): + parser = Parser() + parser.feed('‘') + + def test_finish(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + + def test_source_map(self): + parser = Parser(options=Parser.OPT_SOURCEPOS) + parser.feed('‘') + doc = parser.finish() + source_map = parser.get_source_map() + extents = [e for e in source_map] + self.assertEqual(len(extents), 1) + self.assertEqual(extents[0].type, ExtentType.CONTENT) + self.assertEqual(extents[0].start, 0) + self.assertEqual(extents[0].stop, 3) + + def test_render_html(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_html() + self.assertEqual(res, '

‘

\n') + + def test_render_xml(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_xml() + self.assertEqual( + res, + '\n' + '\n' + '\n' + ' \n' + ' ‘\n' + ' \n' + '\n') + + def test_render_commonmark(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_commonmark() + self.assertEqual(res, '‘\n') + + def test_render_man(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_man() + self.assertEqual( + res, + '.PP\n' + '\[oq]\n') + + def test_render_latex(self): + parser = Parser() + parser.feed('‘') + doc = parser.finish() + res = doc.to_latex() + self.assertEqual(res, '`\n') + +class TestNode(unittest.TestCase): + def test_type(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + self.assertEqual(type(doc), Document) + + def test_equal(self): + parser = Parser() + parser.feed('foo\n\nbar') + doc = parser.finish() + para_one = doc.first_child + para_two = doc.last_child + self.assertEqual(doc.last_child, para_one.next) + self.assertEqual(para_one != para_two, True) + + def test_first_child(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + child1 = doc.first_child + child2 = doc.first_child + self.assertEqual(child1, child2) + self.assertEqual((child1 != child2), False) + + def test_last_child(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + child1 = doc.first_child + child2 = doc.last_child + self.assertEqual(child1, child2) + self.assertEqual((child1 != child2), False) + + def test_next(self): + parser = Parser() + parser.feed('foo *bar*') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + text = para.first_child + self.assertEqual(type(text), Text) + emph = text.next + self.assertEqual(type(emph), Emph) + self.assertEqual(para.next, None) + + def test_previous(self): + parser = Parser() + parser.feed('foo *bar*') + doc = parser.finish() + para = doc.first_child 
+ text = para.first_child + emph = text.next + self.assertEqual(emph.previous, text) + self.assertEqual(para.previous, None) + + def test_children(self): + parser = Parser() + parser.feed('foo *bar*') + doc = parser.finish() + para = doc.first_child + children = [c for c in para] + self.assertEqual(len(children), 2) + self.assertEqual(type(children[0]), Text) + self.assertEqual(type(children[1]), Emph) + + # Test unlinking while iterating + + children = [] + for c in para: + children.append(c) + c.unlink() + + self.assertEqual(len(children), 2) + self.assertEqual(type(children[0]), Text) + self.assertEqual(type(children[1]), Emph) + + def test_parent(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + para = doc.first_child + self.assertEqual(para.parent, doc) + + def test_new(self): + with self.assertRaises(NotImplementedError): + n = Node() + + def test_unlink(self): + parser = Parser() + parser.feed('foo *bar*') + doc = parser.finish() + para = doc.first_child + para.unlink() + self.assertEqual(doc.to_html(), '') + + def test_append_child(self): + parser = Parser() + parser.feed('') + doc = parser.finish() + doc.append_child(Paragraph()) + self.assertEqual(doc.to_html(), '

\n') + with self.assertRaises(LibcmarkError): + doc.append_child(Text(literal='foo')) + + def test_prepend_child(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + doc.prepend_child(Paragraph()) + self.assertEqual(doc.to_html(), '

\n

foo

\n') + with self.assertRaises(LibcmarkError): + doc.prepend_child(Text(literal='foo')) + + def test_insert_before(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + para = doc.first_child + para.insert_before(Paragraph()) + self.assertEqual(doc.to_html(), '

\n

foo

\n') + with self.assertRaises(LibcmarkError): + para.insert_before(Text(literal='foo')) + + def test_insert_after(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + para = doc.first_child + para.insert_after(Paragraph()) + self.assertEqual(doc.to_html(), '

foo

\n

\n') + with self.assertRaises(LibcmarkError): + para.insert_after(Text(literal='foo')) + + def test_consolidate_text_nodes(self): + parser = Parser() + parser.feed('foo **bar*') + doc = parser.finish() + self.assertEqual(len([c for c in doc.first_child]), 3) + doc.consolidate_text_nodes() + self.assertEqual(len([c for c in doc.first_child]), 2) + +class TestLiteral(unittest.TestCase): + def test_text(self): + parser = Parser() + parser.feed('foo') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + text = para.first_child + self.assertEqual(type(text), Text) + self.assertEqual(text.literal, 'foo') + text.literal = 'bar' + self.assertEqual(text.to_html(), 'bar') + +class TestDocument(unittest.TestCase): + def test_new(self): + doc = Document() + self.assertEqual(doc.to_html(), + '') + +class TestBlockQuote(unittest.TestCase): + def test_new(self): + bq = BlockQuote() + self.assertEqual(bq.to_html(), + '
\n
\n') + +class TestList(unittest.TestCase): + def test_new(self): + list_ = List() + self.assertEqual(list_.to_html(), + '
    \n
\n') + + def test_type(self): + parser = Parser() + parser.feed('* foo') + doc = parser.finish() + list_ = doc.first_child + self.assertEqual(type(list_), List) + self.assertEqual(list_.type, ListType.BULLET) + list_.type = ListType.ORDERED + self.assertEqual(doc.to_html(), + '
    \n' + '
  1. foo
  2. \n' + '
\n') + + def test_start(self): + parser = Parser() + parser.feed('2. foo') + doc = parser.finish() + list_ = doc.first_child + self.assertEqual(type(list_), List) + self.assertEqual(list_.start, 2) + list_.start = 1 + self.assertEqual(doc.to_commonmark(), + '1. foo\n') + with self.assertRaises(LibcmarkError): + list_.start = -1 + list_.type = ListType.BULLET + + def test_delim(self): + parser = Parser() + parser.feed('1. foo') + doc = parser.finish() + list_ = doc.first_child + self.assertEqual(type(list_), List) + self.assertEqual(list_.delim, '.') + list_.delim = ')' + self.assertEqual(doc.to_commonmark(), + '1) foo\n') + + def test_tight(self): + parser = Parser() + parser.feed('* foo\n' + '\n' + '* bar\n') + doc = parser.finish() + list_ = doc.first_child + self.assertEqual(type(list_), List) + self.assertEqual(list_.tight, False) + self.assertEqual(doc.to_commonmark(), + ' - foo\n' + '\n' + ' - bar\n') + + list_.tight = True + self.assertEqual(doc.to_commonmark(), + ' - foo\n' + ' - bar\n') + + with self.assertRaises(LibcmarkError): + list_.tight = 42 + +class TestItem(unittest.TestCase): + def test_new(self): + item = Item() + self.assertEqual(item.to_html(), + '
  • \n') + +class TestCodeBlock(unittest.TestCase): + def test_new(self): + cb = CodeBlock(literal='foo', fence_info='python') + self.assertEqual(cb.to_html(), + '
    foo
    \n') + + def test_fence_info(self): + parser = Parser() + parser.feed('``` markdown\n' + 'hello\n' + '```\n') + doc = parser.finish() + code_block = doc.first_child + self.assertEqual(type(code_block), CodeBlock) + self.assertEqual(code_block.fence_info, 'markdown') + code_block.fence_info = 'python' + self.assertEqual(doc.to_commonmark(), + '``` python\n' + 'hello\n' + '```\n') + +class TestHtmlBlock(unittest.TestCase): + def test_new(self): + hb = HtmlBlock(literal='

    foo

    ') + self.assertEqual(hb.to_html(), + '

    foo

    \n') + +class TestCustomBlock(unittest.TestCase): + def test_new(self): + cb = CustomBlock() + self.assertEqual(cb.to_html(), + '') + +class TestParagraph(unittest.TestCase): + def test_new(self): + para = Paragraph() + self.assertEqual(para.to_html(), + '

    \n') + +class TestHeading(unittest.TestCase): + def test_new(self): + heading = Heading(level=3) + self.assertEqual(heading.to_html(), + '

    \n') + + def test_level(self): + parser = Parser() + parser.feed('# foo') + doc = parser.finish() + heading = doc.first_child + self.assertEqual(type(heading), Heading) + self.assertEqual(heading.level, 1) + heading.level = 3 + self.assertEqual(heading.level, 3) + + self.assertEqual(doc.to_html(), + '

    foo

    \n') + + with self.assertRaises(LibcmarkError): + heading.level = 10 + +class TestThematicBreak(unittest.TestCase): + def test_new(self): + tb = ThematicBreak() + self.assertEqual(tb.to_html(), + '
    \n') + +class TestText(unittest.TestCase): + def test_new(self): + text = Text(literal='foo') + self.assertEqual(text.to_html(), + 'foo') + +class TestSoftBreak(unittest.TestCase): + def test_new(self): + sb = SoftBreak() + self.assertEqual(sb.to_html(), '\n') + self.assertEqual(sb.to_html(options=Parser.OPT_HARDBREAKS), + '
    \n') + self.assertEqual(sb.to_html(options=Parser.OPT_NOBREAKS), + ' ') + +class TestLineBreak(unittest.TestCase): + def test_new(self): + lb = LineBreak() + self.assertEqual(lb.to_html(), '
    \n') + +class TestCode(unittest.TestCase): + def test_new(self): + code = Code(literal='bar') + self.assertEqual(code.to_html(), 'bar') + +class TestHtmlInline(unittest.TestCase): + def test_new(self): + hi = HtmlInline(literal='baz') + self.assertEqual(hi.to_html(), 'baz') + +class TestCustomInline(unittest.TestCase): + def test_new(self): + ci = CustomInline() + self.assertEqual(ci.to_html(), + '') + +class TestEmph(unittest.TestCase): + def test_new(self): + emph = Emph() + self.assertEqual(emph.to_html(), + '') + +class TestStrong(unittest.TestCase): + def test_new(self): + strong = Strong() + self.assertEqual(strong.to_html(), + '') + +class TestLink(unittest.TestCase): + def test_new(self): + link = Link(url='http://foo.com', title='foo') + self.assertEqual(link.to_html(), + '') + + def test_url(self): + parser = Parser() + parser.feed('\n') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + link = para.first_child + self.assertEqual(type(link), Link) + self.assertEqual(link.url, 'http://foo.com') + link.url = 'http://bar.net' + # Yeah that's crappy behaviour but not our problem here + self.assertEqual(doc.to_commonmark(), + '[http://foo.com](http://bar.net)\n') + + def test_title(self): + parser = Parser() + parser.feed('\n') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + link = para.first_child + self.assertEqual(type(link), Link) + self.assertEqual(link.title, '') + link.title = 'foo' + self.assertEqual(doc.to_html(), + '

    http://foo.com

    \n') + +class TestImage(unittest.TestCase): + def test_new(self): + image = Image(url='http://foo.com', title='foo') + self.assertEqual(image.to_html(), + '') + + def test_url(self): + parser = Parser() + parser.feed('![image](image.com)\n') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + link = para.first_child + self.assertEqual(type(link), Image) + self.assertEqual(link.url, 'image.com') + link.url = 'http://bar.net' + self.assertEqual(doc.to_commonmark(), + '![image](http://bar.net)\n') + + def test_title(self): + parser = Parser() + parser.feed('![image](image.com "ze image")\n') + doc = parser.finish() + para = doc.first_child + self.assertEqual(type(para), Paragraph) + image = para.first_child + self.assertEqual(type(image), Image) + self.assertEqual(image.title, 'ze image') + image.title = 'foo' + self.assertEqual(doc.to_html(), + '

    image

    \n') + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('libdir') + args = parser.parse_known_args() + conf.set_library_path(args[0].libdir) + unittest.main(argv=[sys.argv[0]] + args[1]) diff --git a/wrappers/wrapper.py b/wrappers/wrapper.py old mode 100755 new mode 100644 index 98e7f2b46..048d33bf5 --- a/wrappers/wrapper.py +++ b/wrappers/wrapper.py @@ -1,37 +1,938 @@ -#!/usr/bin/env python +from __future__ import unicode_literals -# Example for using the shared library from python -# Will work with either python 2 or python 3 -# Requires cmark library to be installed - -from ctypes import CDLL, c_char_p, c_long +from ctypes import * import sys import platform -sysname = platform.system() +c_object_p = POINTER(c_void_p) -if sysname == 'Darwin': - libname = "libcmark.dylib" -elif sysname == 'Windows': - libname = "cmark.dll" +if sys.version_info[0] > 2: + def bytes_and_length(text): + if type(text) == str: + text = text.encode("utf8") + return text, len(text) else: - libname = "libcmark.so" -cmark = CDLL(libname) - -markdown = cmark.cmark_markdown_to_html -markdown.restype = c_char_p -markdown.argtypes = [c_char_p, c_long, c_long] - -opts = 0 # defaults - -def md2html(text): - if sys.version_info >= (3,0): - textbytes = text.encode('utf-8') - textlen = len(textbytes) - return markdown(textbytes, textlen, opts).decode('utf-8') - else: - textbytes = text - textlen = len(text) - return markdown(textbytes, textlen, opts) - -sys.stdout.write(md2html(sys.stdin.read())) + def bytes_and_length(text): + if type(text) == unicode: + text = text.encode("utf8") + return text, len(text) + +def unicode_from_char_p(res, fn, args): + ret = res.decode("utf8") + return ret + +class owned_char_p(c_void_p): + def __del__(self): + conf.lib.cmark_default_mem_free(self.value) + +def unicode_from_owned_char_p(res, fn, args): + ret = cast(res, c_char_p).value.decode("utf8") + return ret + +def boolean_from_result(res, fn, args): + return bool(res) + 
+def delim_from_int(res, fn, args): + if res == 0: + return '' + elif res == 1: + return '.' + elif res == 2: + return ')' + +class BaseEnumeration(object): + def __init__(self, value): + if value >= len(self.__class__._kinds): + self.__class__._kinds += [None] * (value - len(self.__class__._kinds) + 1) + if self.__class__._kinds[value] is not None: + raise ValueError('{0} value {1} already loaded'.format( + str(self.__class__), value)) + self.value = value + self.__class__._kinds[value] = self + self.__class__._name_map = None + + def from_param(self): + return self.value + + @classmethod + def from_id(cls, id, fn, args): + if id >= len(cls._kinds) or cls._kinds[id] is None: + raise ValueError('Unknown template argument kind %d' % id) + return cls._kinds[id] + + @property + def name(self): + """Get the enumeration name of this cursor kind.""" + if self._name_map is None: + self._name_map = {} + for key, value in self.__class__.__dict__.items(): + if isinstance(value, self.__class__): + self._name_map[value] = key + return str(self._name_map[self]) + + def __repr__(self): + return '%s.%s' % (self.__class__.__name__, self.name,) + +class Parser(object): + OPT_DEFAULT = 0 + OPT_SOURCEPOS = 1 << 1 + OPT_HARDBREAKS = 1 << 2 + OPT_SAFE = 1 << 3 + OPT_NOBREAKS = 1 << 4 + OPT_NORMALIZE = 1 << 8 + OPT_VALIDATE_UTF8 = 1 << 9 + OPT_SMART = 1 << 10 + + def __init__(self, options=0): + self._parser = conf.lib.cmark_parser_new(options) + + def __del__(self): + conf.lib.cmark_parser_free(self._parser) + + def feed(self, text): + conf.lib.cmark_parser_feed(self._parser, *bytes_and_length(text)) + + def finish(self): + return conf.lib.cmark_parser_finish(self._parser) + + def get_source_map(self): + return conf.lib.cmark_parser_get_first_source_extent(self._parser) + +class LibcmarkError(Exception): + def __init__(self, message): + self.m = message + + def __str__(self): + return self.m + +class NodeType(BaseEnumeration): + _kinds = [] + _name_map = None + +# FIXME: a bit awkward 
to update, not sure what the best practice is +NodeType.NONE = NodeType(0) +NodeType.DOCUMENT = NodeType(1) +NodeType.BLOCK_QUOTE = NodeType(2) +NodeType.LIST = NodeType(3) +NodeType.ITEM = NodeType(4) +NodeType.CODE_BLOCK = NodeType(5) +NodeType.HTML_BLOCK = NodeType(6) +NodeType.CUSTOM_BLOCK = NodeType(7) +NodeType.PARAGRAPH = NodeType(8) +NodeType.HEADING = NodeType(9) +NodeType.THEMATIC_BREAK = NodeType(10) +NodeType.REFERENCE = NodeType(11) +NodeType.TEXT = NodeType(12) +NodeType.SOFTBREAK = NodeType(13) +NodeType.LINEBREAK = NodeType(14) +NodeType.CODE = NodeType(15) +NodeType.HTML_INLINE = NodeType(16) +NodeType.CUSTOM_INLINE = NodeType(17) +NodeType.EMPH = NodeType(18) +NodeType.STRONG = NodeType(19) +NodeType.LINK = NodeType(20) +NodeType.IMAGE = NodeType(21) + +class ListType(BaseEnumeration): + _kinds = [] + _name_map = None + +ListType.BULLET = ListType(1) +ListType.ORDERED = ListType(2) + +class Node(object): + __subclass_map = {} + + def __init__(self): + self._owned = False + raise NotImplementedError + + @staticmethod + def from_result(res, fn=None, args=None): + try: + res.contents + except ValueError: + return None + cls = Node.get_subclass_map()[conf.lib.cmark_node_get_type(res)] + + ret = cls.__new__(cls) + ret._node = res + ret._owned = False + return ret + + @classmethod + def get_subclass_map(cls): + if cls.__subclass_map: + return cls.__subclass_map + + res = {c._node_type: c for c in cls.__subclasses__()} + + for c in cls.__subclasses__(): + res.update(c.get_subclass_map()) + + return res + + def unlink(self): + conf.lib.cmark_node_unlink(self._node) + self._owned = True + + def append_child(self, child): + res = conf.lib.cmark_node_append_child(self._node, child._node) + if not res: + raise LibcmarkError("Can't append child %s to node %s" % (str(child), str(self))) + child._owned = False + + def prepend_child(self, child): + res = conf.lib.cmark_node_prepend_child(self._node, child._node) + if not res: + raise LibcmarkError("Can't prepend 
child %s to node %s" % (str(child), str(self))) + child._owned = False + + def insert_before(self, sibling): + res = conf.lib.cmark_node_insert_before(self._node, sibling._node) + if not res: + raise LibcmarkError("Can't insert sibling %s before node %s" % (str(sibling), str(self))) + sibling._owned = False + + def insert_after(self, sibling): + res = conf.lib.cmark_node_insert_after(self._node, sibling._node) + if not res: + raise LibcmarkError("Can't insert sibling %s after node %s" % (str(sibling), str(self))) + sibling._owned = False + + def consolidate_text_nodes(self): + conf.lib.cmark_consolidate_text_nodes(self._node) + + def to_html(self, options=Parser.OPT_DEFAULT): + return conf.lib.cmark_render_html(self._node, options) + + def to_xml(self, options=Parser.OPT_DEFAULT): + return conf.lib.cmark_render_xml(self._node, options) + + def to_commonmark(self, options=Parser.OPT_DEFAULT, width=0): + return conf.lib.cmark_render_commonmark(self._node, options, width) + + def to_man(self, options=Parser.OPT_DEFAULT, width=0): + return conf.lib.cmark_render_man(self._node, options, width) + + def to_latex(self, options=Parser.OPT_DEFAULT, width=0): + return conf.lib.cmark_render_latex(self._node, options, width) + + @property + def parent(self): + return conf.lib.cmark_node_parent(self._node) + + @property + def first_child(self): + return conf.lib.cmark_node_first_child(self._node) + + @property + def last_child(self): + return conf.lib.cmark_node_last_child(self._node) + + @property + def next(self): + return conf.lib.cmark_node_next(self._node) + + @property + def previous(self): + return conf.lib.cmark_node_previous(self._node) + + def __eq__(self, other): + if other is None: + return False + return addressof(self._node.contents) == addressof(other._node.contents) + + def __ne__(self, other): + if other is None: + return True + return addressof(self._node.contents) != addressof(other._node.contents) + + def __hash__(self): + return 
hash(addressof(self._node.contents)) + + def __del__(self): + if self._owned: + conf.lib.cmark_node_free(self._node) + + def __iter__(self): + cur = self.first_child + while (cur): + next_ = cur.next + yield cur + cur = next_ + +class Literal(Node): + _node_type = NodeType.NONE + + @property + def literal(self): + return conf.lib.cmark_node_get_literal(self._node) + + @literal.setter + def literal(self, value): + bytes_, _ = bytes_and_length(value) + if not conf.lib.cmark_node_set_literal(self._node, bytes_): + raise LibcmarkError("Invalid literal %s\n" % str(value)) + +class Document(Node): + _node_type = NodeType.DOCUMENT + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class BlockQuote(Node): + _node_type = NodeType.BLOCK_QUOTE + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class List(Node): + _node_type = NodeType.LIST + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + @property + def type(self): + return conf.lib.cmark_node_get_list_type(self._node) + + @type.setter + def type(self, type_): + if not conf.lib.cmark_node_set_list_type(self._node, type_.value): + raise LibcmarkError("Invalid type %s" % str(type_)) + + @property + def delim(self): + return conf.lib.cmark_node_get_list_delim(self._node) + + @delim.setter + def delim(self, value): + if value == '.': + delim_type = 1 + elif value == ')': + delim_type = 2 + else: + raise LibcmarkError('Invalid delim type %s' % str(value)) + + conf.lib.cmark_node_set_list_delim(self._node, delim_type) + + @property + def start(self): + return conf.lib.cmark_node_get_list_start(self._node) + + @start.setter + def start(self, value): + if not conf.lib.cmark_node_set_list_start(self._node, value): + raise LibcmarkError("Invalid list start %s\n" % str(value)) + + @property + def tight(self): + return 
conf.lib.cmark_node_get_list_tight(self._node) + + @tight.setter + def tight(self, value): + if value is True: + tightness = 1 + elif value is False: + tightness = 0 + else: + raise LibcmarkError("Invalid list tightness %s\n" % str(value)) + if not conf.lib.cmark_node_set_list_tight(self._node, tightness): + raise LibcmarkError("Invalid list tightness %s\n" % str(value)) + +class Item(Node): + _node_type = NodeType.ITEM + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class CodeBlock(Literal): + _node_type = NodeType.CODE_BLOCK + + def __init__(self, literal='', fence_info=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + self.fence_info = fence_info + + @property + def fence_info(self): + return conf.lib.cmark_node_get_fence_info(self._node) + + @fence_info.setter + def fence_info(self, value): + bytes_, _ = bytes_and_length(value) + if not conf.lib.cmark_node_set_fence_info(self._node, bytes_): + raise LibcmarkError("Invalid fence info %s\n" % str(value)) + +class HtmlBlock(Literal): + _node_type = NodeType.HTML_BLOCK + + def __init__(self, literal=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + + +class CustomBlock(Node): + _node_type = NodeType.CUSTOM_BLOCK + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class Paragraph(Node): + _node_type = NodeType.PARAGRAPH + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class Heading(Node): + _node_type = NodeType.HEADING + + def __init__(self, level=1): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self.level = level + self._owned = True + + @property + def level(self): + return int(conf.lib.cmark_node_get_heading_level(self._node)) 
+ + @level.setter + def level(self, value): + res = conf.lib.cmark_node_set_heading_level(self._node, value) + if (res == 0): + raise LibcmarkError("Invalid heading level %s" % str(value)) + +class ThematicBreak(Node): + _node_type = NodeType.THEMATIC_BREAK + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class Reference(Node): + _node_type = NodeType.REFERENCE + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class Text(Literal): + _node_type = NodeType.TEXT + + def __init__(self, literal=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + + +class SoftBreak(Node): + _node_type = NodeType.SOFTBREAK + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class LineBreak(Node): + _node_type = NodeType.LINEBREAK + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class Code(Literal): + _node_type = NodeType.CODE + + def __init__(self, literal=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + + +class HtmlInline(Literal): + _node_type = NodeType.HTML_INLINE + + def __init__(self, literal=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.literal = literal + + +class CustomInline(Node): + _node_type = NodeType.CUSTOM_INLINE + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class Emph(Node): + _node_type = NodeType.EMPH + + def __init__(self): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + +class Strong(Node): + _node_type = NodeType.STRONG + + def __init__(self): + self._node = 
conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + + +class Link(Node): + _node_type = NodeType.LINK + + def __init__(self, url='', title=''): + self._node = conf.lib.cmark_node_new(self.__class__._node_type.value) + self._owned = True + self.url = url + self.title = title + + @property + def url(self): + return conf.lib.cmark_node_get_url(self._node) + + @url.setter + def url(self, value): + bytes_, _ = bytes_and_length(value) + if not conf.lib.cmark_node_set_url(self._node, bytes_): + raise LibcmarkError("Invalid url %s\n" % str(value)) + + @property + def title(self): + return conf.lib.cmark_node_get_title(self._node) + + @title.setter + def title(self, value): + bytes_, _ = bytes_and_length(value) + if not conf.lib.cmark_node_set_title(self._node, bytes_): + raise LibcmarkError("Invalid title %s\n" % str(value)) + +class Image(Link): + _node_type = NodeType.IMAGE + +class ExtentType(BaseEnumeration): + _kinds = [] + _name_map = None + +ExtentType.NONE = ExtentType(0) +ExtentType.OPENER = ExtentType(1) +ExtentType.CLOSER = ExtentType(2) +ExtentType.BLANK = ExtentType(3) +ExtentType.CONTENT = ExtentType(4) +ExtentType.PUNCTUATION = ExtentType(5) +ExtentType.LINK_DESTINATION = ExtentType(6) +ExtentType.LINK_TITLE = ExtentType(7) +ExtentType.LINK_LABEL = ExtentType(8) +ExtentType.REFERENCE_DESTINATION = ExtentType(9) +ExtentType.REFERENCE_LABEL = ExtentType(10) +ExtentType.REFERENCE_TITLE = ExtentType(11) + +class Extent(object): + @staticmethod + def from_result(res, fn=None, args=None): + ret = Extent() + ret._extent = res + return ret + + @property + def start(self): + return conf.lib.cmark_source_extent_get_start(self._extent) + + @property + def stop(self): + return conf.lib.cmark_source_extent_get_stop(self._extent) + + @property + def type(self): + return conf.lib.cmark_source_extent_get_type(self._extent) + + @property + def node(self): + return conf.lib.cmark_source_extent_get_node(self._extent) + +class SourceMap(object): + 
@staticmethod + def from_result(res, fn, args): + ret = SourceMap() + ret._root = res + return ret + + def __iter__(self): + cur = self._root + while (cur): + yield Extent.from_result(cur) + cur = conf.lib.cmark_source_extent_get_next(cur) + +def markdown_to_html(text, options=Parser.OPT_DEFAULT): + bytes_, length = bytes_and_length(text) + return conf.lib.cmark_markdown_to_html(bytes_, length, options) + +def parse_document(text, options=Parser.OPT_DEFAULT): + bytes_, length = bytes_and_length(text) + return conf.lib.cmark_parse_document(bytes_, length, options) + +functionList = [ + ("cmark_default_mem_free", + [c_void_p]), + ("cmark_markdown_to_html", + [c_char_p, c_long, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_parse_document", + [c_char_p, c_long, c_int], + c_object_p, + Node.from_result), + ("cmark_parser_new", + [c_int], + c_object_p), + ("cmark_parser_free", + [c_object_p]), + ("cmark_parser_feed", + [c_object_p, c_char_p, c_long]), + ("cmark_parser_finish", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_parser_get_first_source_extent", + [c_object_p], + c_object_p, + SourceMap.from_result), + ("cmark_source_extent_get_next", + [c_object_p], + c_object_p), + ("cmark_source_extent_get_start", + [c_object_p], + c_ulonglong), + ("cmark_source_extent_get_stop", + [c_object_p], + c_ulonglong), + ("cmark_source_extent_get_type", + [c_object_p], + c_int, + ExtentType.from_id), + ("cmark_source_extent_get_node", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_render_html", + [c_object_p, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_render_xml", + [c_object_p, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_render_commonmark", + [c_object_p, c_int, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_render_man", + [c_object_p, c_int, c_int], + owned_char_p, + unicode_from_owned_char_p), + ("cmark_render_latex", + [c_object_p, c_int, c_int], + owned_char_p, + 
unicode_from_owned_char_p), + ("cmark_node_new", + [c_int], + c_object_p), + ("cmark_node_free", + [c_object_p]), + ("cmark_node_get_type", + [c_object_p], + c_int, + NodeType.from_id), + ("cmark_node_parent", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_first_child", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_last_child", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_next", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_previous", + [c_object_p], + c_object_p, + Node.from_result), + ("cmark_node_unlink", + [c_object_p]), + ("cmark_node_append_child", + [c_object_p, c_object_p], + c_int, + boolean_from_result), + ("cmark_node_prepend_child", + [c_object_p, c_object_p], + c_int, + boolean_from_result), + ("cmark_node_insert_before", + [c_object_p, c_object_p], + c_int, + boolean_from_result), + ("cmark_node_insert_after", + [c_object_p, c_object_p], + c_int, + boolean_from_result), + ("cmark_consolidate_text_nodes", + [c_object_p]), + ("cmark_node_get_literal", + [c_object_p], + c_char_p, + unicode_from_char_p), + ("cmark_node_set_literal", + [c_object_p, c_char_p], + c_int, + boolean_from_result), + ("cmark_node_get_heading_level", + [c_object_p], + c_int), + ("cmark_node_set_heading_level", + [c_object_p, c_int], + c_int, + boolean_from_result), + ("cmark_node_get_list_type", + [c_object_p], + c_int, + ListType.from_id), + ("cmark_node_set_list_type", + [c_object_p], + c_int, + boolean_from_result), + ("cmark_node_get_list_delim", + [c_object_p], + c_int, + delim_from_int), + ("cmark_node_set_list_delim", + [c_object_p, c_int], + c_int), + ("cmark_node_get_list_start", + [c_object_p], + c_int), + ("cmark_node_set_list_start", + [c_object_p, c_int], + c_int, + boolean_from_result), + ("cmark_node_get_list_tight", + [c_object_p], + c_int, + boolean_from_result), + ("cmark_node_set_list_tight", + [c_object_p, c_int], + c_int, + boolean_from_result), + ("cmark_node_get_fence_info", + 
[c_object_p], + c_char_p, + unicode_from_char_p), + ("cmark_node_set_fence_info", + [c_object_p, c_char_p], + c_int, + boolean_from_result), + ("cmark_node_get_url", + [c_object_p], + c_char_p, + unicode_from_char_p), + ("cmark_node_set_url", + [c_object_p, c_char_p], + c_int, + boolean_from_result), + ("cmark_node_get_title", + [c_object_p], + c_char_p, + unicode_from_char_p), + ("cmark_node_set_title", + [c_object_p, c_char_p], + c_int, + boolean_from_result), +] + +# Taken from clang.cindex +def register_function(lib, item, ignore_errors): + # A function may not exist, if these bindings are used with an older or + # incompatible version of libcmark.so. + try: + func = getattr(lib, item[0]) + except AttributeError as e: + msg = str(e) + ". Please ensure that your python bindings are "\ + "compatible with your libcmark version." + if ignore_errors: + return + raise LibcmarkError(msg) + + if len(item) >= 2: + func.argtypes = item[1] + + if len(item) >= 3: + func.restype = item[2] + + if len(item) == 4: + func.errcheck = item[3] + +def register_functions(lib, ignore_errors): + """Register function prototypes with a libccmark library instance. + + This must be called as part of library instantiation so Python knows how + to call out to the shared library. 
+ """ + + def register(item): + return register_function(lib, item, ignore_errors) + + for f in functionList: + register(f) + +class Config: + library_path = None + library_file = None + compatibility_check = True + loaded = False + lib_ = None + + @staticmethod + def set_library_path(path): + """Set the path in which to search for libcmark""" + if Config.loaded: + raise Exception("library path must be set before before using " \ + "any other functionalities in libcmark.") + + Config.library_path = path + + @staticmethod + def set_library_file(filename): + """Set the exact location of libcmark""" + if Config.loaded: + raise Exception("library file must be set before before using " \ + "any other functionalities in libcmark.") + + Config.library_file = filename + + @staticmethod + def set_compatibility_check(check_status): + """ Perform compatibility check when loading libcmark + + The python bindings are only tested and evaluated with the version of + libcmark they are provided with. To ensure correct behavior a (limited) + compatibility check is performed when loading the bindings. This check + will throw an exception, as soon as it fails. + + In case these bindings are used with an older version of libcmark, parts + that have been stable between releases may still work. Users of the + python bindings can disable the compatibility check. This will cause + the python bindings to load, even though they are written for a newer + version of libcmark. Failures now arise if unsupported or incompatible + features are accessed. The user is required to test themselves if the + features they are using are available and compatible between different + libcmark versions. 
+ """ + if Config.loaded: + raise Exception("compatibility_check must be set before before " \ + "using any other functionalities in libcmark.") + + Config.compatibility_check = check_status + + @property + def lib(self): + if self.lib_: + return self.lib_ + lib = self.get_cmark_library() + register_functions(lib, not Config.compatibility_check) + Config.loaded = True + self.lib_ = lib + return lib + + def get_filename(self): + if Config.library_file: + return Config.library_file + + import platform + name = platform.system() + + if name == 'Darwin': + file = 'libcmark.dylib' + elif name == 'Windows': + file = 'cmark.dll' + else: + file = 'libcmark.so' + + if Config.library_path: + file = Config.library_path + '/' + file + + return file + + def get_cmark_library(self): + try: + library = cdll.LoadLibrary(self.get_filename()) + except OSError as e: + msg = str(e) + "(%s). To provide a path to libcmark use " \ + "Config.set_library_path() or " \ + "Config.set_library_file()." % self.get_filename() + raise LibcmarkError(msg) + + return library + + def function_exists(self, name): + try: + getattr(self.lib, name) + except AttributeError: + return False + + return True + +conf = Config() + +__alla__ = [ + 'Parser', + 'LibcmarkError', + 'NodeType', + 'ListType', + 'Node', + 'Document', + 'BlockQuote', + 'List', + 'Item', + 'CodeBlock', + 'HtmlBlock', + 'CustomBlock', + 'Paragraph', + 'Heading', + 'ThematicBreak', + 'Text', + 'SoftBreak', + 'LineBreak', + 'Code', + 'HtmlInline', + 'CustomInline', + 'Emph', + 'Strong', + 'Link', + 'Image', + 'ExtentType', + 'Extent', + 'SourceMap', + 'markdown_to_html', + 'parse_document', + 'Config', + 'conf' +] From 014b7812cbd1b671abd543fc623a15545869c7a8 Mon Sep 17 00:00:00 2001 From: Mathieu Duponchelle Date: Sun, 25 Dec 2016 18:03:21 +0100 Subject: [PATCH 5/5] cmark-format: Initial work on a formatting tool. The only implemented feature is reindenting to arbitrary width, but that was also the actually complicated one so there's that. 
--- src/CMakeLists.txt | 3 + src/cmark-format.in | 24 +++ src/remarkor.py | 462 ++++++++++++++++++++++++++++++++++++++++++ test/CMakeLists.txt | 7 + test/test_remarkor.py | 44 ++++ 5 files changed, 540 insertions(+) create mode 100755 src/cmark-format.in create mode 100644 src/remarkor.py create mode 100644 test/test_remarkor.py diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c7761ff3b..b9220af0f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,6 +56,9 @@ include_directories(. ${CMAKE_CURRENT_BINARY_DIR}) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmark_version.h.in ${CMAKE_CURRENT_BINARY_DIR}/cmark_version.h) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmark-format.in + ${CMAKE_CURRENT_BINARY_DIR}/cmark-format) + include (GenerateExportHeader) add_executable(${PROGRAM} ${PROGRAM_SOURCES}) diff --git a/src/cmark-format.in b/src/cmark-format.in new file mode 100755 index 000000000..b1e2f53ed --- /dev/null +++ b/src/cmark-format.in @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse + +HERE = "@CMAKE_CURRENT_SOURCE_DIR@" +sys.path.append(HERE) +sys.path.append(os.path.join(HERE, os.pardir, 'wrappers')) + +from remarkor import * +from wrapper import conf + +conf.set_library_path("@CMAKE_CURRENT_BINARY_DIR@") + +if __name__=='__main__': + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument('input') + arg_parser.add_argument('--width', type=int, default=80) + args = arg_parser.parse_args() + + remarkor = Remarkor.from_filename(args.input) + res = remarkor.remark(width=args.width) + sys.stdout.write(res) diff --git a/src/remarkor.py b/src/remarkor.py new file mode 100644 index 000000000..a2b7784ff --- /dev/null +++ b/src/remarkor.py @@ -0,0 +1,462 @@ +from wrapper import * +import re +from collections import defaultdict + +class RemarkorError(LibcmarkError): + pass + +def pretty_print_extents(source_map): + for extent in source_map: + print ('%d-%d %s for %s' % (extent.start, extent.stop, extent.type, 
type(extent.node))) + +ESCAPE_REGEX = re.compile('^' # Start of the string + '(' # The potential problematic pattern + '[' # Any of these characters + '#' # A heading + '|>' # Or a blockquote + '|*|+|-' # Or an unordered list start + ']' # End of single characters + '|[0-9]+[.|)]' # Ordered list start + ')' # End of the problematic pattern + '(' + '[ ]+.*' + '|$)' + ) + +ESCAPE_THEMATIC_REGEX = re.compile('^' # Start of the string + '((\*\s*){3,}|(\-\s*){3,}|(_\s*){3,})' # Either '*' or '-' or '_' 3 times or more, ws allowed + '$' # Nothing else is allowed + ) + +ESCAPE_CODE_BLOCK_REGEX = re.compile('^' # Start of the string + '(`{3,}|~{3,})' # Either '`' or `~` 3 times or more + '[^`]*' # Anything but '`' + '$' # Nothing else is allowed + ) + +ESCAPE_SETEXT_REGEX = re.compile('^' # Start of the string + '(\-+|=+)' # Either '-' or '=' one time or more + '[ ]*' # Optionally followed by 0 or more whitespace characters + '$' # Nothing else is allowed + ) + +ESCAPE_REFERENCE_DEF_REGEX = re.compile('^' # Start of the string + '\[' # Opening '[' + '.*' # Anything + '\]' # Closing ']' + ':' # Literal ':' + '.*' # Consume the remainder + ) + +def build_reverse_source_map(source_map): + rmap = defaultdict(list) + for ex in source_map: + rmap[ex.node].append(ex) + return rmap + +class Remarkor: + def __init__(self, contents): + self.dump_context = None + if type(contents) == str: + self.source = contents.encode('utf8') + else: + assert type(contents) == bytes + self.source = contents + + def remark(self, width=80, validate=True): + self.__reset(width) + + self.__dump(self.root_node, '') + self.need_cr = 1 + self.__flush('', '') + + res = '\n'.join(self.result) + + if validate: + self.__validate(res) + + return res + + @staticmethod + def from_filename(filename): + with open(filename, 'rb') as _: + contents = _.read() + return Remarkor(contents) + + def __reset(self, width): + self.parser = Parser(options=Parser.OPT_SOURCEPOS) + self.parser.feed(self.source) + self.root_node 
= self.parser.finish() + self.source_map = self.parser.get_source_map() + self.rmap = build_reverse_source_map(self.source_map) + + # List of lines + self.result = [''] + # Number of new lines to insert before flushing new content + self.need_cr = 0 + # Whether to insert 1 or 2 new lines before the next item + self.in_tight_list = False + # Workaround for indented lists, which are not reliably breakable by + # any block (in particular indented code) + # FIXME: Ask why this case is even part of the spec, because afaiu it's just broken + self.break_out_of_list = False + # Maximum number of columns + self.width = width + # Whether flush operations can break lines + self.flush_can_break = True + # The offset in the last line to check escape from + self.last_line_content_offset = 0 + # If we break the line when rendering this node, escape the last character + self.escape_link_if_breaking = None + # Do not try to escape anything + self.no_escape = False + # Do not try to escape html blocks, type link + self.no_escape_html_block = False + + def __normalize_texts(self, node): + if type(node) == Text: + node.literal = ' '.join(node.literal.split()) + if not node.literal: + node.unlink() + for c in node: + self.__normalize_texts(c) + + def __strip_blanks(self, node): + if type(node) == SoftBreak: + node.insert_after(Text(literal=' ')) + node.unlink() + return None + elif type(node) == HtmlBlock: + if node.literal.strip() == "": + node.unlink() + for c in node: + self.__strip_blanks(c) + + # This method compares the result with the original AST, stripping + # all blank nodes, all html end-list workaround blocks, and + # consolidating and normalizing text nodes. 
+ def __validate(self, res): + parser = Parser() + parser.feed(res) + new_root_node = parser.finish() + + self.__strip_blanks(self.root_node) + self.__strip_blanks(new_root_node) + self.root_node.consolidate_text_nodes() + new_root_node.consolidate_text_nodes() + self.__normalize_texts(self.root_node) + self.__normalize_texts(new_root_node) + if self.root_node.to_xml() != new_root_node.to_xml(): + raise RemarkorError('Refactoring changed the AST !') + + def __utf8(self, start, stop): + return self.source[start:stop].decode('utf8') + + def __get_extent_utf8(self, extent): + if extent: + return self.__utf8(extent.start, extent.stop) + return '' + + def __get_closer_utf8(self, node): + for ex in reversed(self.rmap[node]): + if ex.type == ExtentType.CLOSER: + return self.__get_extent_utf8(ex) + return '' + return self.__get_extent_utf8(self.get_closer(node)) + + def __get_opener_utf8(self, node): + for ex in self.rmap[node]: + if ex.type == ExtentType.OPENER: + return self.__get_extent_utf8(ex) + return '' + + def __breakup_contents(self, node): + skip_next_ws = False + token = '' + extents = self.rmap[node] + + is_text = type(node) is Text + is_escaped = False + + if is_text: + while node.next: + node = node.next + if type(node) is not Text: + break + extents += self.rmap[node] + self.rmap[node] = [] + + def sanitize(token): + if is_text: + if type(node) is Link and re.match('.*\[.*\]$', token): + self.escape_link_if_breaking = node + return token + + for ex in extents: + if ex.type != ExtentType.CONTENT: + continue + for c in self.__utf8(ex.start, ex.stop): + if c == ' ' and not is_escaped: + if token: + yield token + token = '' + if not skip_next_ws: + yield ' ' + skip_next_ws = True + else: + token += c + skip_next_ws = False + if c == '\\': + is_escaped = not is_escaped + else: + is_escaped = False + if token: + yield sanitize(token) + + def __blankline(self): + self.need_cr = 2 + + def __cr(self): + self.need_cr = max(self.need_cr, 1) + + def 
__check_escape(self): + if self.no_escape: + self.no_escape = False + return + + prefix = self.result[-1][:self.last_line_content_offset] + unprefixed = self.result[-1][self.last_line_content_offset:] + m = re.match(ESCAPE_REGEX, unprefixed) + if (m): + try: + first_space = unprefixed.index(' ') + except ValueError: + first_space = len(unprefixed) + self.result[-1] = '%s%s\\%s' % (prefix, + unprefixed[0:first_space - 1], + unprefixed[first_space - 1:]) + return + + m = re.match(ESCAPE_THEMATIC_REGEX, unprefixed) + if (m): + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + return + + m = re.match(ESCAPE_CODE_BLOCK_REGEX, unprefixed) + if (m): + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + return + + m = re.match(ESCAPE_SETEXT_REGEX, unprefixed) + if (m): + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + return + + m = re.match(ESCAPE_REFERENCE_DEF_REGEX, unprefixed) + if (m): + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + return + + # FIXME: certainly very expensive, but as we make it so + # html inlines can never start a line, it is at least + # safe and correct + if not self.no_escape_html_block: + root_node = parse_document(unprefixed) + if type(root_node.first_child) in [HtmlBlock, Reference]: + self.result[-1] = '%s\\%s' % (prefix, unprefixed) + self.no_escape_html_block = False + + def __check_prefix(self, prefix): + if not self.result[-1]: + self.result[-1] = prefix + self.last_line_content_offset = len(prefix) + + def __flush(self, prefix, utf8, escape_if_breaking=0): + if self.in_tight_list: + self.need_cr = min(self.need_cr, 1) + + while self.need_cr: + self.__check_prefix(prefix) + self.__check_escape() + self.result.append('') + self.need_cr -= 1 + + self.__check_prefix(prefix) + + if (utf8 and + self.flush_can_break and + len(self.result[-1]) > self.last_line_content_offset and + len(self.result[-1]) + len(utf8) >= self.width): + self.result[-1] = self.result[-1].rstrip(' ') + self.__check_escape() + if escape_if_breaking: + 
self.result[-1] = "%s\\%s" % (self.result[-1][:escape_if_breaking], + self.result[-1][escape_if_breaking:]) + self.result.append('') + self.__check_prefix(prefix) + if utf8 == ' ': + return + + self.result[-1] += utf8 + + def __dump(self, node, prefix): + old_in_tight_list = self.in_tight_list + old_break_out_of_list = self.break_out_of_list + old_flush_can_break = self.flush_can_break + + opener_utf8 = self.__get_opener_utf8(node).strip() + + if type(node) is BlockQuote: + self.__flush(prefix, opener_utf8 + ' ') + self.last_line_content_offset = len(opener_utf8 + ' ' + prefix) + prefix += opener_utf8 + ' ' + elif type(node) is Heading: + self.flush_can_break = False + if (opener_utf8): + self.__flush(prefix, opener_utf8 + ' ') + self.no_escape = True + elif type(node) is Item: + opener_utf8_with_blank = '' + last_stop = -1 + # Very awkward, see list item indentation tests + for ex in self.rmap[node]: + if last_stop != -1 and ex.start != last_stop: + break + last_stop = ex.stop + opener_utf8_with_blank += self.__get_extent_utf8(ex) + if len(opener_utf8_with_blank) > 4: + self.break_out_of_list = True + + self.__flush(prefix, opener_utf8 + ' ') + self.last_line_content_offset = len(opener_utf8 + ' ' + prefix) + # Only setting here to make sure the call to flush was made with + # the right tightness. 
+ self.in_tight_list = node.parent.tight + prefix += (len(opener_utf8) + 1) * ' ' + elif type(node) in [CodeBlock, HtmlBlock]: + self.flush_can_break = False + for ex in self.rmap[node]: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8.rstrip('\r\n')) + self.no_escape = True + # Make sure to prefix the next line + if utf8.endswith('\n'): + self.__cr() + self.flush_can_break = old_flush_can_break + self.__blankline() + elif type(node) is ThematicBreak: + self.flush_can_break = False + utf8 = ' '.join(self.__breakup_contents(node)).rstrip('\r\n') + self.__flush(prefix, utf8) + self.no_escape = True + # Make sure to prefix the next line + if utf8.endswith('\n'): + self.__cr() + self.flush_can_break = old_flush_can_break + self.__blankline() + elif type(node) is Reference: + self.flush_can_break = False + for ex in self.rmap[node]: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8) + if ex.type == ExtentType.OPENER: + self.no_escape = True + self.flush_can_break = old_flush_can_break + if type(node.next) is Reference: # Keep reference lists tight + self.__cr() + else: + self.__blankline() + elif type(node) is Text: + for word in self.__breakup_contents(node): + self.__flush(prefix, word) + elif type(node) is SoftBreak: + self.__flush(prefix, ' ') + elif type(node) is LineBreak: + self.flush_can_break = False + content = ''.join([self.__get_extent_utf8(ex).rstrip('\r\n') for ex in self.rmap[node]]) + # Keep the source hardbreak style + if '\\' in content: + self.__flush(prefix, content) + else: + self.__flush(prefix, ' ') + self.flush_can_break = old_flush_can_break + self.__cr() + elif type(node) in [Emph, Strong]: + self.__flush(prefix, opener_utf8) + if self.result[-1] == prefix + opener_utf8: + self.no_escape = True + self.flush_can_break = False + elif type(node) in [Link, Image]: + if self.escape_link_if_breaking == node: + self.__flush(prefix, opener_utf8, escape_if_breaking=-1) + else: + self.__flush(prefix, opener_utf8) + if 
self.result[-1] == prefix + opener_utf8: + self.no_escape = True + elif type(node) is HtmlInline: + self.flush_can_break = False + for ex in self.rmap[node]: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8.rstrip('\r\n')) + if utf8.endswith('\n'): + self.__cr() + self.flush_can_break = old_flush_can_break + elif type(node) is Code: + for ex in self.rmap[node]: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8.rstrip('\r\n')) + if utf8.endswith('\n'): + self.__cr() + + for child in node: + tmp_flush_can_break = self.flush_can_break + tmp_node = child + + # See __breakup_contents + while type(tmp_node) is Text and type(tmp_node.next) is Text: + tmp_node = tmp_node.next + + if type(tmp_node.next) is HtmlInline or type(tmp_node.previous) is HtmlInline: + self.flush_can_break = False + self.__dump(child, prefix) + self.flush_can_break = tmp_flush_can_break + + if type(node) in [Emph, Strong]: + self.__flush(prefix, self.__get_closer_utf8(node).rstrip('\r\n')) + self.flush_can_break = old_flush_can_break + elif type(node) is List: + self.in_tight_list = old_in_tight_list + if self.break_out_of_list: + self.__cr() + self.__flush(prefix, "") + self.no_escape = True + self.__cr() + self.break_out_of_list = old_break_out_of_list + elif type(node) is Heading: + for ex in self.rmap[node]: + if ex.type != ExtentType.OPENER: + utf8 = self.__get_extent_utf8(ex) + self.__flush(prefix, utf8.rstrip('\r\n')) + self.no_escape = True + if utf8.endswith('\n'): + self.__cr() + self.flush_can_break = old_flush_can_break + elif type(node) in [Link, Image]: + for ex in self.rmap[node]: + if ex.type != ExtentType.OPENER: + self.flush_can_break = old_flush_can_break + utf8 = self.__get_extent_utf8(ex).strip(' \r\n') + if ex.type == ExtentType.PUNCTUATION and prev_extent.type == ExtentType.PUNCTUATION: + self.flush_can_break = False + elif ex.type == ExtentType.LINK_TITLE: + self.__flush(prefix, ' ') + if ex.type != ExtentType.BLANK: + self.__flush(prefix, utf8) + 
if ex.type == ExtentType.LINK_DESTINATION: + if self.result[-1] == prefix + utf8 and re.match('^<.*>$', utf8): + self.no_escape_html_block = True + prev_extent = ex + self.flush_can_break = old_flush_can_break + + if type(node) in [Paragraph, List, BlockQuote, Item, Heading, Document]: + self.__blankline() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 32497fb0a..5dfd2c2f5 100755 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -90,6 +90,13 @@ IF (PYTHONINTERP_FOUND) "${CMAKE_CURRENT_SOURCE_DIR}/test_cmark.py" "${CMAKE_CURRENT_BINARY_DIR}/../src" ) + + add_test(remarkor + ${PYTHON_EXECUTABLE} + "${CMAKE_CURRENT_SOURCE_DIR}/test_remarkor.py" + "${CMAKE_CURRENT_BINARY_DIR}/../src" + "${CMAKE_CURRENT_SOURCE_DIR}/spec.txt" + ) ELSE(PYTHONINTERP_FOUND) message("\n*** A python 3 interpreter is required to run the python binding tests.\n") add_test(skipping_python_binding_tests diff --git a/test/test_remarkor.py b/test/test_remarkor.py new file mode 100644 index 000000000..f4d495c68 --- /dev/null +++ b/test/test_remarkor.py @@ -0,0 +1,44 @@ +import unittest +import argparse +import os +import sys + +from spec_tests import get_tests + +here = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(here) +sys.path.append(os.path.join(here, os.pardir, 'src')) +sys.path.append(os.path.join(here, os.pardir, 'wrappers')) + +from remarkor import * + +if __name__=='__main__': + parser = argparse.ArgumentParser() + parser.add_argument('libdir') + parser.add_argument('specpath') + args = parser.parse_known_args() + conf.set_library_path(args[0].libdir) + SPEC_PATH = args[0].specpath + +class TestRemarkorMeta(type): + def __new__(mcs, name, bases, dict): + def gen_test(test_description): + def test(self): + remarkor = Remarkor(test_description['markdown']) + remarkor.remark(width=1, validate=True) + return test + + for t in get_tests(SPEC_PATH): + test_name = 'test_%s' % re.sub('\W|^(?=\d)','_', t['section']) + cnt = 1 + while '%s_%d' % (test_name, cnt) in 
dict: + cnt += 1 + test_name = '%s_%d' % (test_name, cnt) + dict[test_name] = gen_test(t) + return type.__new__(mcs, name, bases, dict) + +class TestRemarkor(unittest.TestCase, metaclass=TestRemarkorMeta): + pass + +if __name__=='__main__': + unittest.main(argv=[sys.argv[0]] + args[1])