-*- mode: org -*-
#+TITLE:       spine (doc_reform) regex defaults
#+DESCRIPTION: documents - structuring, publishing in multiple formats & search
#+FILETAGS:    :spine:regex:
#+AUTHOR:      Ralph Amissah
#+EMAIL:       [[mailto:ralph.amissah@gmail.com][ralph.amissah@gmail.com]]
#+COPYRIGHT:   Copyright (C) 2015 - 2022 Ralph Amissah
#+LANGUAGE:    en
#+STARTUP:     content hideblocks hidestars noindent entitiespretty
#+PROPERTY:    header-args  :exports code
#+PROPERTY:    header-args+ :noweb yes
#+PROPERTY:    header-args+ :results no
#+PROPERTY:    header-args+ :cache no
#+PROPERTY:    header-args+ :padline no
#+PROPERTY:    header-args+ :mkdirp yes
#+OPTIONS:     H:3 num:nil toc:t \n:t ::t |:t ^:nil -:t f:t *:t

- [[./doc-reform.org][doc-reform.org]]  [[./][org/]]

* meta ctRegex

- [[./doc-reform.org][doc-reform.org]]  [[./][org/]]

https://dlang.org/phobos/std_regex.html
std.regex matching functions accept a pattern in one of three forms (ctRegex, used throughout this file, yields the StaticRegex form):
- Plain string, in which case it's compiled to bytecode before matching.
- Regex!char (wchar/dchar) that contains a pattern in the form of compiled bytecode.
- StaticRegex!char (wchar/dchar) that contains a pattern in the form of compiled native machine code.

22 special characters used (┚ is listed twice, so 21 are distinct):

#+BEGIN_SRC txt
【】〖〗┥┝┤├¤░┘┙┚┼┿╂┊┏┚┆■☼
#+END_SRC

** _module template_ :module:

#+HEADER: :tangle "../src/doc_reform/meta/rgx.d"
#+HEADER: :noweb yes
#+BEGIN_SRC d
<<doc_header_including_copyright_and_license>>
/++
  regex: regular expressions used in sisu document parser
  (all patterns are compiled with ctRegex; the struct body is assembled
  from the noweb blocks referenced below, in the order listed)
+/
module doc_reform.meta.rgx;
// template wrapping a single struct; each <<...>> reference below is replaced
// at tangle time by the org source block of the same name
static template spineRgxIn() {
  static struct RgxI {
    // patterns specific to reading/parsing markup (meta_*)
    <<meta_rgx_misc>>
    <<meta_rgx_comments>>
    <<meta_rgx_config>>
    <<meta_rgx_headers>>
    <<meta_rgx_heading_marks>>
    <<meta_rgx_paragraph_marks>>
    <<meta_rgx_blocks>>
    <<meta_rgx_block_tic>>
    <<meta_rgx_block_curly>>
    <<meta_rgx_sub_match_code>>
    <<meta_rgx_table>>
    <<meta_rgx_footnote_endnote>>
    <<meta_rgx_url>>
    <<meta_rgx_images>>
    <<meta_rgx_book_index>>
    <<meta_rgx_heading_number>>
    <<meta_rgx_object_number_off_object>>
    <<meta_rgx_object_number_off_block>>
    <<meta_rgx_code_block>>
    <<meta_rgx_line_and_page_breaks>>
    <<meta_rgx_bibliography>>
    <<meta_rgx_book_index_split>>
    <<meta_rgx_topic_register_split>>
    <<meta_rgx_language_codes>>
    // generic patterns (prgmkup_*), also referenced by the output module
    <<prgmkup_rgx_spaces>>
    <<prgmkup_rgx_filename_and_path>>
    <<prgmkup_rgx_inline_breaks>>
    <<prgmkup_rgx_internal_footnotes_and_endnotes>>
    <<prgmkup_rgx_inline_links>>
    <<prgmkup_rgx_font_face>>
    <<prgmkup_rgx_font_face_line>>
  }
}
#+END_SRC

** misc :misc:

#+NAME: meta_rgx_misc
#+BEGIN_SRC d
/+ misc +/
// a whole-string long option: "--" + lowercase letter + letters/digits/hyphens
static flag_action                                    = ctRegex!(`^(--[a-z][a-z0-9-]+)$`);
// shortest run of text between a pair of double quotes (captured without the quotes)
static within_quotes                                  = ctRegex!(`"(.+?)"`, "m");
// type suffixes at the end of a tag string
static yaml_tag_is_str                                = ctRegex!(`:str$`);
static yaml_tag_is_int                                = ctRegex!(`:int$`);
static yaml_tag_is_map                                = ctRegex!(`:map$`);
static yaml_tag_is_seq                                = ctRegex!(`:seq$`);
// ';' list separators (arr_delimiter also swallows spaces before the ';')
static make_heading_delimiter                         = ctRegex!(`[;][ ]*`);
static arr_delimiter                                  = ctRegex!(`[ ]*[;][ ]*`);
// "first part, second part": split on the first comma (captures 1 and 2)
static name_delimiter                                 = ctRegex!(`^([^,]+)[ ]*,[ ]+(.+?)$`);
// a number, optionally followed by "-number"; "ocn" is the first number, "link" the whole
static book_index_go                                  = ctRegex!("(?P<link>(?P<ocn>[0-9]+)(?:-[0-9]+)?)");
static trailing_comma                                 = ctRegex!(",[ ]*$");
// comma, 1-2 spaces, two literal backslashes, newline, then four spaces at a line end
static trailing_linebreak                             = ctRegex!(",[ ]{1,2}\\\\\\\\\n[ ]{4}$","m");
static newline_eol_strip_preceding                    = ctRegex!("[ ]*\n");
static newline_eol_delimiter_only                     = ctRegex!("^\n");
// two literal backslashes (the inline line-break mark) with any surrounding
// whitespace; the trailing part must be `\s*` (whitespace), not `s*`, which
// would consume literal "s" characters from the text following the mark
static markup_inline_linebreak                        = ctRegex!(`\s*\\\\\s*`, "m");
// blank-line separated units: a newline, optional spaces, then one or more newlines
static para_delimiter                                 = ctRegex!("\n[ ]*\n+");
static table_col_delimiter                            = ctRegex!("[ ]*\n+", "mg");
static table_row_delimiter                            = ctRegex!("\n[ ]*\n+", "mg");
// "special" table form: rows end at a newline, columns are separated by '|'
static table_row_delimiter_special                    = ctRegex!("[ ]*\n", "mg");
static table_col_delimiter_special                    = ctRegex!("[ ]*[|][ ]*", "mg");
// a single digit (0-9), respectively a single digit in the range 0-7
static levels_numbered                                = ctRegex!(`^[0-9]$`);
static levels_numbered_headings                       = ctRegex!(`^[0-7]$`);
// content made up only of digits, spaces, separators, percent, brackets, minus and currency signs
static numeric_col                                    = ctRegex!(`^[ 0-9,.%$£₤Є€€¥()-]+$`);
#+END_SRC

#+BEGIN_SRC d
// static true_dollar                                    = ctRegex!(`\$`, "gm");
// static sep                                            = ctRegex!(`␣`, "gm");
// static uid_sep                                        = ctRegex!(`:`, "gm"); // ctRegex!(`␣`, "gm");
// static book_index_go_scroll                           = ctRegex!("(?P<link>(?P<ocn>[0-9]+)(?:-[0-9]+)?)");
// static book_index_go_seg                              = ctRegex!("(?P<link>(?P<ocn>[0-9]+)(?:-[0-9]+)?):(?P<seg>[a-z0-9_-]+)");
// static book_index_go_seg_                             = ctRegex!("(?P<link>(?P<ocn>[0-9]+)(?:-[0-9]+)?)(:(?P<seg>[a-z0-9_-]+))?");
// static book_index_go_seg_anchorless                   = ctRegex!("(?P<link>(?P<ocn>[0-9]+)(?:-[0-9]+)?)");
// static numeric                                        = ctRegex!(`[ 0-9,.-]+`);
#+END_SRC

** comments :comment:

#+NAME: meta_rgx_comments
#+BEGIN_SRC d
/+ comments +/
// one or more '%' at the start of the input, followed by a single space
static comment                                        = ctRegex!(`^%+ `);
#+END_SRC

** config

#+NAME: meta_rgx_config
#+BEGIN_SRC d
/+ config (placeholder: this block currently defines no patterns) +/
#+END_SRC

** native headers
*** native header :native:header:

#+NAME: meta_rgx_headers
#+BEGIN_SRC d
/+ header +/
// literal "@..." placeholder tokens; the patterns are unanchored, so each
// matches anywhere in the input (note that `@title` is also a prefix of the
// two longer title tokens)
static variable_doc_title_author_date           = ctRegex!(`@title-author-date`);
static variable_doc_title_author                = ctRegex!(`@title-author`);
static variable_doc_title                       = ctRegex!(`@title`);
static variable_doc_author                      = ctRegex!(`@author|@creator`);
static variable_doc_date                        = ctRegex!(`@date`);
// "last, first": named group "last" is the text before the first comma that is
// followed by whitespace, "first" is everything after it
static raw_author_munge                         = ctRegex!(`(?P<last>\S.+?),\s+(?P<first>.+)`,"i");
// multi-line: a lowercase key, ':' and then either an (optionally
// double-quoted) word character or the end of the line
static yaml_config                              = ctRegex!(`^[a-z]+\s*:\s*(?:"?\w|$)`, "m");
#+END_SRC

** heading operators :heading:operator:

#+NAME: meta_rgx_heading_marks
#+BEGIN_SRC d
/+ heading operators +/
// heading markers have the form [:]LEVEL~[tag] where LEVEL is A-D or 1-4
// level A heading marker followed by a space (multi-line)
static heading_a                                = ctRegex!(`^:?[A][~] `, "m");
// any heading: capture 1 is the level, capture 2 the optional tag
static heading                                  = ctRegex!(`^:?([A-D1-4])[~]([a-z0-9_.-]*[?]?)\s+`,"i");
// any heading, including the special "!" sections (glossary, biblio/bibliography,
// reference(s), blurb); the set of bibliography spellings matches heading_biblio
// (previously misspelt "bibliogrphy", so "1~!bibliography" was not recognised)
static headings                                 = ctRegex!(`^:?(?P<level>[A-D1-4])[~](?:[a-z0-9_.-]*[?]?|[!](?:glossary|biblio(?:graphy)?|references?|blurb))(?:\s|$)`,"i");
// headings of level A-D or 1 only: capture 1 is the level, capture 2 the optional tag
static heading_seg_and_above                    = ctRegex!(`^:?([A-D1])[~]([a-z0-9_.-]*[?]?)\s+`,"i");
// heading carrying an explicit tag directly after '~' (named group "anchor")
static heading_anchor_tag                       = ctRegex!(`^:?[A-D1-4][~](?P<anchor>[a-z0-9_.-]+) `,"i");
// untagged heading whose text starts with "chapter|article|section|clause N" or a bare number
static heading_identify_anchor_tag              = ctRegex!(`^:?[A-D1-4][~]\s+(?:(?:(?:chapter|article|section|clause)\s+[0-9.]+)|(?:[0-9]+))`,"i");
// capture 1: the keyword, capture 2: the number (parts separated by '.' or ':')
static heading_extract_named_anchor_tag         = ctRegex!(`^:?[A-D1-4][~]\s+(chapter|article|section|clause)\s+((?:[0-9]+[.:])*[0-9]+)(?=[.:;, ]|$)`,"i");
// capture 1: a leading bare number such as "2" or "2.3"; the separator is
// restricted to '.' or ':' as in the named variant above (an unescaped '.'
// previously let any character, e.g. a letter, join two digit runs)
static heading_extract_unnamed_anchor_tag       = ctRegex!(`^:?[A-D1-4][~]\s+((?:[0-9]+[.:])*[0-9]+)(?=[.:;, ]|$)`);
// heading marker immediately followed by a space, i.e. without a tag
static heading_marker_missing_tag               = ctRegex!(`^:?([A-D1-4])[~] `);
// as heading_anchor_tag but the tag may also contain ':' (capture 1 marker, capture 2 tag)
static heading_anchor_tag_plus_colon            = ctRegex!(`^:?([A-D1-4][~])([a-z0-9_.:-]+) `,"i");
static heading_marker_tag_has_colon             = ctRegex!(`([:])`);
// special level-1 "!" section headings
static heading_biblio                           = ctRegex!(`^1[~][!](biblio(?:graphy)?|references?)`);
static heading_glossary                         = ctRegex!(`^1[~][!](glossary)`);
static heading_blurb                            = ctRegex!(`^1[~][!](blurb)`);
#+END_SRC

#+BEGIN_SRC d
// static heading_marker                                 = ctRegex!(`^:?([A-D1-4])[~]`);
#+END_SRC

** paragraph operators :paragraph:operator:

#+NAME: meta_rgx_paragraph_marks
#+BEGIN_SRC d
/+ paragraph operators +/
// all markers are anchored at the start of the input and end with a space
// "_* "  bullet
static para_bullet                              = ctRegex!(`^_[*] `);
// "_N* " bullet with indent digit N (1-9)
static para_bullet_indent                       = ctRegex!(`^_(?P<indent>[1-9])[*] `);
// "_N "  indent digit N (1-9)
static para_indent                              = ctRegex!(`^_(?P<indent>[1-9])[ ]`);
// "_H_I " two digits (0-9), named "hang" and "indent"
static para_indent_hang                         = ctRegex!(`^_(?P<hang>[0-9])_(?P<indent>[0-9])[ ]`);
// any of the forms above; capture 1 is the second digit of the "_H_I " form when present
static para_attribs                             = ctRegex!(`^_(?:(?:[0-9])(?:_([0-9]))?|(?:[1-9])?[*]) `);
// "*~name" anywhere in the text, followed by a space or the end of input
static para_inline_link_anchor                  = ctRegex!(`\*[~](?P<anchor>[a-z0-9_.-]+)(?= |$)`,"i");
#+END_SRC

#+NAME: grouped_text_rgx_paragraph_marks
#+BEGIN_SRC d
/+ paragraph operators +/
// paragraph markers compiled with the "m" (multi-line) flag, so that '^'
// can match at the start of each line of a multi-line text
// "_N " indent, one pattern per fixed digit 1-9
static grouped_para_indent_1                    = ctRegex!(`^_1[ ]`, "m");
static grouped_para_indent_2                    = ctRegex!(`^_2[ ]`, "m");
static grouped_para_indent_3                    = ctRegex!(`^_3[ ]`, "m");
static grouped_para_indent_4                    = ctRegex!(`^_4[ ]`, "m");
static grouped_para_indent_5                    = ctRegex!(`^_5[ ]`, "m");
static grouped_para_indent_6                    = ctRegex!(`^_6[ ]`, "m");
static grouped_para_indent_7                    = ctRegex!(`^_7[ ]`, "m");
static grouped_para_indent_8                    = ctRegex!(`^_8[ ]`, "m");
static grouped_para_indent_9                    = ctRegex!(`^_9[ ]`, "m");
// "_* " bullet, then "_N* " bullet with indent, one pattern per fixed digit 1-9
static grouped_para_bullet                      = ctRegex!(`^_[*] `, "m");
static grouped_para_bullet_indent_1             = ctRegex!(`^_1[*] `, "m");
static grouped_para_bullet_indent_2             = ctRegex!(`^_2[*] `, "m");
static grouped_para_bullet_indent_3             = ctRegex!(`^_3[*] `, "m");
static grouped_para_bullet_indent_4             = ctRegex!(`^_4[*] `, "m");
static grouped_para_bullet_indent_5             = ctRegex!(`^_5[*] `, "m");
static grouped_para_bullet_indent_6             = ctRegex!(`^_6[*] `, "m");
static grouped_para_bullet_indent_7             = ctRegex!(`^_7[*] `, "m");
static grouped_para_bullet_indent_8             = ctRegex!(`^_8[*] `, "m");
static grouped_para_bullet_indent_9             = ctRegex!(`^_9[*] `, "m");
// generic forms capturing the digit(s) as named groups
static grouped_para_bullet_indent               = ctRegex!(`^_(?P<indent>[1-9])[*] `, "m");
static grouped_para_indent_hang                 = ctRegex!(`^_(?P<hang>[0-9])_(?P<indent>[0-9])[ ]`, "m");
#+END_SRC

#+BEGIN_SRC d
// static grouped_para_indent                                    = ctRegex!(`^_(?P<indent>[1-9])[ ]`, "m");
#+END_SRC

** blocked markup
*** blocked markup curly & tic :block:

#+NAME: meta_rgx_blocks
#+BEGIN_SRC d
/+ blocked markup +/
// opening line of any block, in one of three forms:
//   1. curly:  NAME[.suffix][(attribs)]{   (only spaces may follow the '{')
//   2. tic:    three backticks, a space, then NAME[.suffix][(attribs)]
//   3. special table markup: {table([h;] N, N, ...)}  (named group "columns")
// NAME is code, poem, group, block, quote or table; a ".suffix" is accepted
// after code (letters, digits, '#', '+', '_') and after poem/group/block/quote
// (letters, digits, '_'), and must be at least two characters long
static block_open                               = ctRegex!("^((code(?:[.][a-z][0-9a-z#+_]+)?|(?:poem|group|block|quote)(?:[.][a-z][0-9a-z_]+)?|table)(?:[(][ a-zA-Z0-9;:,]*[)])?[{][ ]*$)|^`{3} (code(?:[.][a-z][0-9a-z#+_]+)?|(?:poem|group|block|quote)(?:[.][a-z][0-9a-z_]+)?|table)(?:[(][ a-zA-Z0-9;:,]*[)])?|^[{]table[(](?:h;)?(?P<columns>(?:[ ,]+[0-9]+)+)[)][}]");
// opening line of a poem block, curly or tic form
// NOTE(review): unlike block_open this pattern has no ".suffix" part, so
// "poem.xx{" matches block_open but not this pattern - confirm this is intended
static block_poem_open                          = ctRegex!("^((poem(?:[(][ a-zA-Z0-9;:,]*[)])?[{][ ]*$)|`{3} poem(?:[(][ a-zA-Z0-9;:,]*[)])?)");
#+END_SRC

*** blocked markup tic :block:tic:

#+NAME: meta_rgx_block_tic
#+BEGIN_SRC d
/+ blocked markup tics +/
// opening lines of tic blocks: three backticks, a space and the block name,
// then an optional ".suffix" (named group "syntax" for code, "lang" for the
// text blocks) and an optional "(...)" attribute list (named group "attrib");
// the patterns are not anchored at the end of the line
static block_tic_code_open                      = ctRegex!("^`{3} code(?:[.](?P<syntax>[a-z][0-9a-z#+_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?");
static block_tic_poem_open                      = ctRegex!("^`{3} poem(?:[.](?P<lang>[a-z][0-9a-z_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?");
static block_tic_group_open                     = ctRegex!("^`{3} group(?:[.](?P<lang>[a-z][0-9a-z_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?");
static block_tic_block_open                     = ctRegex!("^`{3} block(?:[.](?P<lang>[a-z][0-9a-z_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?");
static block_tic_quote_open                     = ctRegex!("^`{3} quote(?:[.](?P<lang>[a-z][0-9a-z_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?");
// table takes attributes only, no ".suffix"
static block_tic_table_open                     = ctRegex!("^`{3} table(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?"); // ctRegex!("^`{3} table(?:\(.*?\))?");
// closing line: exactly three backticks on a line of their own (multi-line)
static block_tic_close                          = ctRegex!("^(`{3})$","m");
#+END_SRC

*** blocked markup curly :block:curly:

#+NAME: meta_rgx_block_curly
#+BEGIN_SRC d
/+ blocked markup curly +/
// curly blocks open with NAME[.suffix][(attribs)]{ (only spaces may follow the
// '{') and close with }NAME at the start of a line; the optional suffix is
// captured as "syntax" (code) or "lang" (text blocks), the attribute list as
// "attrib"
// the code syntax suffix accepts '#' and '+' (e.g. "c++"), matching the
// character class used by block_open and block_tic_code_open
static block_curly_code_open                    = ctRegex!(`^(?:code(?:[.](?P<syntax>[a-z][0-9a-z#+_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?[{][ ]*$)`);
static block_curly_code_close                   = ctRegex!(`^([}]code)`);
static block_curly_poem_open                    = ctRegex!(`^(poem(?:[.](?P<lang>[a-z][0-9a-z_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?[{][ ]*$)`);
static block_curly_poem_close                   = ctRegex!(`^([}]poem)`);
static block_curly_group_open                   = ctRegex!(`^(group(?:[.](?P<lang>[a-z][0-9a-z_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?[{][ ]*$)`);
static block_curly_group_close                  = ctRegex!(`^([}]group)`);
static block_curly_block_open                   = ctRegex!(`^(block(?:[.](?P<lang>[a-z][0-9a-z_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?[{][ ]*$)`);
static block_curly_block_close                  = ctRegex!(`^([}]block)`);
static block_curly_quote_open                   = ctRegex!(`^(quote(?:[.](?P<lang>[a-z][0-9a-z_]+))?(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?[{][ ]*$)`);
static block_curly_quote_close                  = ctRegex!(`^([}]quote)`);
// table takes attributes only, no ".suffix"
static block_curly_table_open                   = ctRegex!(`^table(?:[(](?P<attrib>[ a-zA-Z0-9;:,]*)[)])?[{][ ]*$`);
static block_curly_table_close                  = ctRegex!(`^([}]table)`);
// special one-line table markup {table([h;] N, N, ...)}: "attrib" is the whole
// bracket content, "columns" the list of numbers
static block_curly_table_special_markup         = ctRegex!(`^[{]table[(](?P<attrib>(?:(h);)?(?P<columns>(?:[, ]+[0-9]+)+))[)][}]`, "mg");
#+END_SRC

*** block sub-matches :block:
**** code

#+NAME: meta_rgx_sub_match_code
#+BEGIN_SRC d
// one of the whole words "linenumber", "number" or "lnr" (named group "number")
static code_numbering                           = ctRegex!(`(?P<number>\blinenumber\b|\bnumber\b|\blnr\b)`);
#+END_SRC

**** table

#+NAME: meta_rgx_table
#+BEGIN_SRC d
// table instructions: optional "h;" (group "c_heading"), optional " cN:"
// (group "c_num", one digit), then a list of numbers separated by commas or
// spaces, each optionally suffixed with 'l' or 'r' (group "c_widths")
static table_head_instructions                  = ctRegex!(`(?:(?P<c_heading>h);)?(?:[ ]+c(?P<c_num>[0-9]):)?(?P<c_widths>(?:[, ]+[0-9]+[lr]?)+)`);
// a single number with its optional 'l'/'r' suffix
static table_col_widths_and_alignment           = ctRegex!(`(?P<width>[0-9]+)(?P<align>[lr]?)`);
// a single number
static table_col_widths                         = ctRegex!(`(?P<widths>[0-9]+)`);
// a single 'l' or 'r'
static table_col_align_match                    = ctRegex!(`(?P<align>[lr])`);
// a '┊' character at the end of a line
static table_col_separator_nl                   = ctRegex!(`[┊]$`, "mg");
#+END_SRC

#+BEGIN_SRC d
// static table_col_align                                = ctRegex!(`(?P<align>[lr]?)`);
// static table_col_separator                            = ctRegex!(`┊`);
#+END_SRC

** inline markup :inline:footnote:
*** footnotes & endnotes

#+NAME: meta_rgx_footnote_endnote
#+BEGIN_SRC d
/+ inline markup footnotes endnotes +/
// notes are delimited by "~{" and "}~"; the shortest such span is matched
// whole note including delimiters (no capture)
static inline_notes_curly_gen                   = ctRegex!(`~\{.+?\}~`, "m");
// capture 1: note text without leading whitespace
static inline_notes_curly                       = ctRegex!(`~\{\s*(.+?)\}~`, "mg");
// notes whose text is introduced by one or more '*' (resp. '+') and whitespace
static inline_notes_curly_sp_asterisk           = ctRegex!(`~\{[*]+\s+(.+?)\}~`, "m");
static inline_notes_curly_sp_plus               = ctRegex!(`~\{[+]+\s+(.+?)\}~`, "m");
// line starting with non-space text that contains "noteref_N"; "ref" is the number
static note_ref                                 = ctRegex!(`^\S+?noteref_(?P<ref>[0-9]+)`, "mg");     // {^{73.}^}#noteref_73
#+END_SRC

#+BEGIN_SRC d
// static inline_notes_curly_sp                          = ctRegex!(`~\{[*+]+\s+(.+?)\}~`, "m");
// static inline_note_curly_delimiters                   = ctRegex!(`(~\{[*+]?\s*)(.+?)(\}~)`, "mg");
// static inline_notes_square                            = ctRegex!(`~\[\s*(.+?)\]~`, "mg");
// static inline_text_and_note_square_sp                 = ctRegex!(`(.+?)~\[[*+]+\s+(.+?)\]~`, "mg");
// static inline_text_and_note_square                    = ctRegex!(`(.+?)~\[\s*(.+?)\]~`, "mg");
// static inline_note_square_delimiters                  = ctRegex!(`(~\[\s*)(.+?)(\]~)`, "mg");
#+END_SRC

*** links/ urls :inline:footnote:

#+NAME: meta_rgx_url
#+BEGIN_SRC d
// link targets recognised below start with one of: "http://", "https://",
// "git://", "../" or "./" (either optionally preceded by '¤'), a bare '¤', or '#'
// quick test: a link start preceded by line start or one of } ( [ space
static smid_inline_url_generic                        = ctRegex!(`(?:^|[}(\[ ])(?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤|#)[a-zA-Z0-9_#]`, "mg");
// capture 1: a link target running up to the next whitespace
static smid_inline_url                                = ctRegex!(`((?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤|#)[a-zA-Z0-9_]\S*)`, "mg");
// bare link ('#' targets excluded): "pre" is the preceding delimiter, "link"
// the target; one trailing punctuation character before a space, ')' , ']' or
// line end is left out of the link
static smid_inline_link_naked_url                     = ctRegex!(`(?P<pre>^|[ (\[])(?P<link>(?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤)\S+?)(?=[.,;:?!'"]?([ )\]]|$))`, "mg");
// "{ text }target": "content" is the text between the braces, "link" the target
static smid_inline_link_markup_regular                = ctRegex!(`(?P<pre>^|[ (\[])\{\s*(?P<content>.+?)\s*\}(?P<link>(?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤|#)\S+?)(?=[;:!,?.]?([ )\]]|$))`, "mg");
// "{~^ text}target": as above with the "~^" marker inside the braces; the
// "punctuated" form leaves one trailing punctuation character out of the link,
// the plain form takes all non-space characters
static smid_inline_link_endnote_url_helper_punctuated = ctRegex!(`\{~\^\s+(?P<content>.+?)\}(?P<link>(?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤|#)\S+?)(?=[.,;:?!]?([ ]|$))`, "mg");
static smid_inline_link_endnote_url_helper            = ctRegex!(`\{~\^\s+(?P<content>.+?)\}(?P<link>(?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤|#)\S+)`, "mg");
#+END_SRC

#+BEGIN_SRC d
// static webserv_url_doc_root                           = ctRegex!(`(?P<url>(?P<domain>https?:\/\/[^ /]+)\/(?P<path>\S*))`, "mg");
#+END_SRC

*** images :images:

#+NAME: meta_rgx_images
#+BEGIN_SRC d
// image file names consist of letters, digits, '.', '_' and '-' and end in
// ".png", ".gif" or ".jpg"
static image                                    = ctRegex!(`([a-zA-Z0-9._-]+?\.(?:png|gif|jpg))`, "mg");
// image markup: opening '{' or '┥' (optionally followed by "~^"), the image
// file name, any further text, a closing '}' or '┝', and then either the word
// "image", a '┤...├' span, or a link target; groups "pre", "image", "post"
static smid_image                               = ctRegex!(`(?P<pre>(?:^|[ ])[{┥](?:~\^\s+|\s*))(?P<image>[a-zA-Z0-9._-]+?\.(?:png|gif|jpg))(?P<post>(?:.*?)\s*[}┝](?:image|┤.*?├|(?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤|#)\S+?)(?=[;:!,?.]?([ )\]]|$)))`, "mg");
// same shape without named groups; the file name may be any non-space text
static smid_image_generic                       = ctRegex!(`(?:^|[ ])[{┥](?:~\^\s+|\s*)\S+\.(?:png|gif|jpg).*?[}┝](?:image|┤.*?├|(?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤|#)\S+?)(?=[;:!,?.]?([ )\]]|$))`, "mg");
// as smid_image, with "WIDTHxHEIGHT" after the file name (groups "width", "height")
static smid_image_with_dimensions               = ctRegex!(`(?P<pre>(?:^|[ ])[{┥](?:~\^\s+|\s*))(?P<image>[a-zA-Z0-9._-]+?\.(?:png|gif|jpg))\s+(?P<width>\d+)x(?P<height>\d+)\s*(?P<post>(?:.*?)\s*[}┝](?:image|┤.*?├|(?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤|#)\S+?)(?=[;:!,?.]?([ )\]]|$)))`, "mg");
// image markup in which the file name is prefixed with '☼' and followed by ",w0h0"
static smid_mod_image_without_dimensions        = ctRegex!(`[{┥](?:~\^\s+|\s*)☼\S+\.(?:png|gif|jpg),w0h0.*[}┝](?:image|┤.*?├|(?:https?|git):\/\/\S+?)(?=[;:!,?.]?([ )\]]|$))`, "mg");
// "{ text }image": group "text" is the content between the braces
static smid_image_delimit                       = ctRegex!(`(?P<pre>^|[ ])\{\s*(?P<text>.+?)\s*\}(?:image)(?=[;:!,?.]?([ )\]]|$))`, "mg");
#+END_SRC

#+BEGIN_SRC d
// static smid_a_image                                    = ctRegex!(`(?P<pre>(?:^|[ ])[{](?:~\^\s+|\s*))(?P<image>[a-zA-Z0-9._-]+?\.(?:png|gif|jpg))(?P<post>(?:.*?)\s*[}](?:image|(?:(?:https?|git):\/\/|¤?\.\.\/|¤?\.\/|¤|#)\S+?)(?=[;:!,?.]?([ )\]]|$)))`, "mg");
#+END_SRC

*** inline markup book index :inline:bookindex:

#+NAME: meta_rgx_book_index
#+BEGIN_SRC d
/+ inline markup book index +/
// a complete "={ ... }" item on one line; "bookindex" is the content
static book_index_item                          = ctRegex!(`^=\{\s*(?P<bookindex>.+?)\}$`, "m");
// an item that opens with "={" and contains no '}' (capture 1: content so far)
static book_index_item_open                     = ctRegex!(`^=\{\s*([^}]*?)$`);
// a line ending in '}' (capture 1: the text before it)
static book_index_item_close                    = ctRegex!(`^(.*?)\}$`, "m");
#+END_SRC

** switch
*** switch off auto-heading number

#+NAME: meta_rgx_heading_number
#+BEGIN_SRC d
// a line starting with the level 1 heading marker "1~"
static auto_heading_numbering_lv1               = ctRegex!(`^1~`, "m");
// level 1-4 heading markers whose tag (any non-space text, possibly empty)
// ends with '-' immediately before whitespace, e.g. "1~-" or "2~name-"
static auto_heading_numbering_off_lv1           = ctRegex!(`^1~\S*?-\s`, "m");
static auto_heading_numbering_off_lv2           = ctRegex!(`^2~\S*?-\s`, "m");
static auto_heading_numbering_off_lv3           = ctRegex!(`^3~\S*?-\s`, "m");
static auto_heading_numbering_off_lv4           = ctRegex!(`^4~\S*?-\s`, "m");
#+END_SRC

#+BEGIN_SRC d
// static auto_heading_numbering_lv2                    = ctRegex!(`^2~`, "m"); // bug?
// static auto_heading_numbering_lv3                    = ctRegex!(`^3~`, "m"); // bug?
// static auto_heading_numbering_lv4                    = ctRegex!(`^4~`, "m"); // bug?
// static auto_heading_numbering_off                    = ctRegex!(`^[A-D1-4]~\S*?-\s`, "m");
#+END_SRC

** no object_number object :ocn:off:object:

#+NAME: meta_rgx_object_number_off_object
#+BEGIN_SRC d
/+ no object_number object +/
// "~#" at the end of a line (trailing spaces allowed)
static object_number_off                        = ctRegex!(`~#[ ]*$`, "m");
// "-#" at the end of a line
static object_number_off_dummy_heading          = ctRegex!(`-#$`, "m");
// either "~#" or "-#" at the end of a line (no trailing spaces allowed here)
static object_number_off_all                    = ctRegex!(`[~-]#$`, "m");
// a whole text made only of separator runs: four or more '.', or two or more
// of the characters - ~ * $ # \ / (mixed), with optional spaces in between
static repeated_character_line_separator        = ctRegex!(`^(?:[ ]*(?:(?:[.][ ]*){4,}|(?:[-][ ]*|[~][ ]*|[*][ ]*|[$][ ]*|[#][ ]*|[\\][ ]*|[/][ ]*){2,})\s*?)+$`);
#+END_SRC

** no object_number block :ocn:off:block:

#+NAME: meta_rgx_object_number_off_block
#+BEGIN_SRC d
/+ no object_number block +/
// whole-text markers: "--~#", "---#" and "--+#"
static object_number_off_block                  = ctRegex!(`^--~#$`);
static object_number_off_block_dummy_heading    = ctRegex!(`^---#$`);
static object_number_off_block_close            = ctRegex!(`^--\+#$`);
// any one of the three markers above
static object_number_block_marks                = ctRegex!(`^--[+~-]#$`);
#+END_SRC

** ignore outside code blocks :block:code:

#+NAME: meta_rgx_code_block
#+BEGIN_SRC d
/+ ignore outside code blocks +/
// a text consisting solely of one marker: "--+#", "--~#" or "---#"; '-' + two
// backslashes + '-'; or '=' + two characters that are each '.' or '\' + '='
static skip_from_regular_parse                  = ctRegex!(`^(--[+~-]#|-[\\]{2}-|=[.\\]{2}=)$`);
#+END_SRC

** line & page breaks :break:

#+NAME: meta_rgx_line_and_page_breaks
#+BEGIN_SRC d
/+ line & page breaks +/
// the literal character '』'
static break_string                             = ctRegex!(`』`);
#+END_SRC

** biblio tags :biblio:tags:

#+NAME: meta_rgx_bibliography
#+BEGIN_SRC d
/+ biblio tags +/
// "tag: value" at the start of the input: capture 1 is one of the listed tag
// names, capture 2 the value after the colon and whitespace
static biblio_tags                              = ctRegex!(`^(is|au|author_raw|author|author_arr|editor_raw|ed|editor_arr|ti|title|subtitle|fulltitle|lng|language|trans|src|jo|journal|in|vol|volume|edn|edition|yr|year|pl|place|pb|pub|publisher|url|pg|pages|note|short_name|id):\s+(.+)`);
// a text that is exactly one of the listed short tag names
static biblio_abbreviations                     = ctRegex!(`^(au|ed|ti|lng|jo|vol|edn|yr|pl|pb|pub|pg|pgs|sn)$`);
#+END_SRC

** bookindex split :bookindex:split:

#+NAME: meta_rgx_book_index_split
#+BEGIN_SRC d
/+ bookindex split +/
// separators with any surrounding whitespace: ';', ':' and '|'
static bi_main_terms_split                            = ctRegex!(`\s*;\s*`);
static bi_main_term_plus_rest_split                   = ctRegex!(`\s*:\s*`);
static bi_sub_terms_plus_object_number_offset_split   = ctRegex!(`\s*\|\s*`);
// "term+N": capture 1 is the text before the first '+' that is followed by
// digits, capture 2 the digits
static bi_term_and_object_numbers_match               = ctRegex!(`^(.+?)\+(\d+)`);
#+END_SRC

** topic register split (document classify)

#+NAME: meta_rgx_topic_register_split
#+BEGIN_SRC d
// separators with any surrounding whitespace: ';', ':' and '|'
static topic_register_main_terms_split          = ctRegex!(`\s*;\s*`);
static topic_register_main_term_plus_rest_split = ctRegex!(`\s*:\s*`);
static topic_register_sub_terms_split           = ctRegex!(`\s*\|\s*`);
// after a '␣': two or more '|'-separated terms (capture 1), none of which
// contains '|' or '␣'
static topic_register_multiple_sub_terms_split  = ctRegex!(`␣([^|␣]+(?:\|[^|␣]+)+)`);
#+END_SRC

** language codes :language:codes:

#+NAME: meta_rgx_language_codes
#+BEGIN_SRC d
/+ language codes +/
// a path ending in LANG/FILENAME.sst or LANG/FILENAME.ssm, where LANG (capture
// 1) is one of the listed language codes and is either the first path segment
// or preceded by '/'; declared static like every other pattern in the struct
// (it was the only per-instance `auto` member)
static language_code_and_filename                                  =
   ctRegex!("(?:^|[/])(am|bg|bn|br|ca|cs|cy|da|de|el|en|eo|es|et|eu|fi|fr|ga|gl|he|hi|hr|hy|ia|is|it|ja|ko|la|lo|lt|lv|ml|mr|nl|no|nn|oc|pl|pt|pt_BR|ro|ru|sa|se|sk|sl|sq|sr|sv|ta|te|th|tk|tr|uk|ur|vi|zh)/[A-Za-z0-9._-].+?[.](?:sst|ssm)$");
#+END_SRC

* 1. output ctRegex

- [[./doc-reform.org][doc-reform.org]]  [[./][org/]]

https://dlang.org/phobos/std_regex.html
std.regex matching functions accept a pattern in one of three forms (ctRegex, used throughout this file, yields the StaticRegex form):
- Plain string, in which case it's compiled to bytecode before matching.
- Regex!char (wchar/dchar) that contains a pattern in the form of compiled bytecode.
- StaticRegex!char (wchar/dchar) that contains a pattern in the form of compiled native machine code.

** _module template_ :module:output:

#+HEADER: :tangle "../src/doc_reform/io_out/rgx.d"
#+HEADER: :noweb yes
#+BEGIN_SRC d
<<doc_header_including_copyright_and_license>>
/++
  regex: regular expressions used by the output (io_out) modules
  (all patterns are compiled with ctRegex; the struct body is assembled
  from the noweb blocks referenced below, in the order listed)
+/
module doc_reform.io_out.rgx;
// template wrapping a single struct; each <<...>> reference below is replaced
// at tangle time by the org source block of the same name
static template spineRgxOut() {
  static struct RgxO {
    <<makes>>
    // generic patterns (prgmkup_*)
    <<prgmkup_rgx_spaces>>
    <<prgmkup_rgx_filename_and_path>>
    <<prgmkup_rgx_inline_breaks>>
    <<prgmkup_rgx_inline_quotes>>
    <<prgmkup_rgx_internal_footnotes_and_endnotes>>
    <<prgmkup_rgx_inline_links>>
    <<prgmkup_rgx_inline_font_face>>
    <<prgmkup_rgx_table>>
    // special characters per output format
    <<sp_ch_xhtml_rgx>>
    <<sp_ch_latex_rgx>>
    <<grouped_text_rgx_paragraph_marks>>
  }
}
#+END_SRC

** make
*** various

#+NAME: makes
#+BEGIN_SRC d
// "new=VALUE" and "break=VALUE" settings: the value (named group) is the
// shortest text up to the next ';' or the end of the input
static make_breakpage                           = ctRegex!(`new=(?P<breakpage>.+?)(?:;|$)`);
static make_breakcolumn                         = ctRegex!(`break=(?P<breakcolumn>.+?)(?:;|$)`);
#+END_SRC

** special characters
*** xhtml special characters

#+NAME: sp_ch_xhtml_rgx
#+BEGIN_SRC d
// single characters that need an entity in xhtml; the trailing comment on
// each line names the intended replacement
static xhtml_ampersand                          = ctRegex!(`[&]`, "m");      // &amp;
static xhtml_quotation                          = ctRegex!(`["]`, "m");      // &quot;
static xhtml_less_than                          = ctRegex!(`[<]`, "m");      // &lt;
static xhtml_greater_than                       = ctRegex!(`[>]`, "m");      // &gt;
// a space followed by two literal backslashes
static xhtml_line_break                         = ctRegex!(` [\\]{2}`, "m"); // <br />
#+END_SRC

*** latex special characters

#+NAME: sp_ch_latex_rgx
#+BEGIN_SRC d
// any one of: % $ { } _ # & \  (capture 1)
static latex_special_char                       = ctRegex!(`([%${}_#&\\])`);
// the same set without '&'
static latex_special_char_for_escape            = ctRegex!(`([%${}_#\\])`);
// '&' only
static latex_special_char_for_escape_and_braces = ctRegex!(`([&])`);
// '%' only
static latex_special_char_for_escape_url        = ctRegex!(`([%])`);
// a backslash followed by one of % $ { } _ # \  (capture 1: the character)
static latex_special_char_escaped               = ctRegex!(`\\([%${}_#\\])`);
// the sequence {\&}  (capture 1: the '&')
static latex_special_char_escaped_braced        = ctRegex!(`[{]\\([&])[}]`);
// an internal-markup link: ┥text┝┤target├ with a target free of whitespace
static latex_identify_inline_link               = ctRegex!(`┥.+?┝┤\S+?├`, "mg");
// backslash + one of _ # $ directly followed by ┨text┣, then backslash + one
// of _ # $ again (capture 1: first character with the ┨text┣ span, capture 2:
// the closing character)
static latex_identify_inline_fontface           = ctRegex!(`\\([_#$]┨.+?┣)\\([_#$])`, "mg");
// a leading '#', or a leading '¤' followed by non-space text up to a '#'
static latex_clean_internal_link                = ctRegex!(`^(?:#|¤\S+?#)`, "m");
// four literal backslashes with any surrounding whitespace
static latex_clean_bookindex_linebreak          = ctRegex!(`\s*\\\\\\\\\s*`, "m");
#+END_SRC

* 2. ctRegex defaults shared by meta & output (generic)
** misc generic

#+NAME: prgmkup_rgx_spaces
#+BEGIN_SRC d
// single newline and single space characters
static newline                                  = ctRegex!("\n", "mg");
static space                                    = ctRegex!(`[ ]`, "mg");
// runs of spaces at the start of a line, or runs of two or more spaces anywhere
static spaces_keep                              = ctRegex!(`(?P<keep_spaces>^[ ]+|[ ]{2,})`, "mg"); // code, verse, block
// runs of spaces at the start of a line only
static spaces_line_start                        = ctRegex!(`^(?P<opening_spaces>[ ]+)`, "mg");
// the placeholder character '░': a single one, and a run of one or more
static nbsp_char                                = ctRegex!(`░`, "mg");
static nbsp_chars                               = ctRegex!(`[░]+`, "mg");
// the character '·'
static middle_dot                               = ctRegex!(`·`, "mg");
#+END_SRC

** filename (& path) (including insert file) :insert:file:path:filename:

#+NAME: prgmkup_rgx_filename_and_path
#+BEGIN_SRC d
// path components are made of letters, digits, '.', '_' and '-'; all patterns
// up to insert_src_fn_ssi_or_sst match the whole input
// optional directory path + file name ending in ".sst" or ".ssm"
static src_pth_sst_or_ssm                       = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.](?P<extension>ss[tm]))$`);
// as above, but the file must sit in "media/text/LL/" (LL: two lowercase
// letters); "podpath" is everything before "media/"
static src_pth_pod_sst_or_ssm                   = ctRegex!(`^(?P<podpath>[/]?(?:[a-zA-Z0-9._-]+/)*)media/text/[a-z]{2}/(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*?[.]ss[tm])$`);
// a path ending in "NAME/pod.manifest"; "filename" is NAME
static src_pth_contents                         = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9][a-zA-Z0-9._-]*)/pod[.]manifest$`);
// optional directory path + file name ending in ".zip"
static src_pth_zip                              = ctRegex!(`^(?P<path>[/]?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]zip)$`);
// any of the three kinds above; "gotfile" is the final part, and exactly one
// of "filename", "filelist" or "filezip" identifies which kind matched
// (here the "path" group itself is repeated, one directory per repetition)
static src_pth_types                            = ctRegex!(`^(?P<path>[/]?[a-zA-Z0-9._-]+/)*(?P<gotfile>(?P<filename>[a-zA-Z0-9._-]+[.]ss[tm])|(?P<filelist>[a-zA-Z0-9._-]+/pod[.]manifest)|(?P<filezip>[a-zA-Z0-9._-]+[.]zip))$`);
// ".sst"/".ssm" file split into "fn_src" (full name), "fn_base" (name without
// suffix) and "fn_src_suffix"; capture 1 is the directory path
static src_fn                                   =
  ctRegex!(`^([/]?(?:[a-zA-Z0-9._-]+/)*)(?P<fn_src>(?P<fn_base>[a-zA-Z0-9._-]+)[.](?P<fn_src_suffix>ss[tm]))$`);
// ".ssm" files only
static src_fn_master                            = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ssm)$`);
// ".ssi" or ".ssm" files
static src_fn_find_inserts                      = ctRegex!(`^(?P<path>/?(?:[a-zA-Z0-9._-]+/)*)(?P<filename>[a-zA-Z0-9._-]+[.]ss[im])$`);
// an insert line: "<<" followed by a relative path to a ".sst" or ".ssi" file
static insert_src_fn_ssi_or_sst                 = ctRegex!(`^<<\s*(?P<path>[a-zA-Z0-9._-]+/)*(?P<filename>[a-zA-Z0-9._-]+[.]ss[ti])$`);
// a path ending in "/DIR/media/text/LL"; "dir" is DIR
static src_base_parent_dir_name                 = ctRegex!(`[/](?P<dir>(?:[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure
// a path ending in "/media/text/LL"; "pth" is everything before that suffix
// and "dir" its trailing component
// NOTE(review): "pth" needs at least one character before "dir", so for an
// input with a single leading component and no '/' in front of it "dir"
// would lose its first character - confirm inputs always contain a '/'
static src_formalised_file_path_parts           = ctRegex!(`(?P<pth>(?:[/a-zA-Z0-9._-]+?)(?P<dir>[a-zA-Z0-9._-]+))(?:/media/text/[a-z]{2})$`); // formalizes dir structure
#+END_SRC

** inline markup

*** inline breaks

#+NAME: prgmkup_rgx_inline_breaks
#+BEGIN_SRC d
/+ line breaks +/
// a newline, optional spaces, and another newline
static br_empty_line                            = ctRegex!(`\n[ ]*\n`, "mg");
// a newline or one of the break characters '┘' '┙'
static br_linebreaks_newlines                   = ctRegex!(`[\n┘┙]`, "mg");
// one of the break characters '┘' '┙'
static br_linebreaks                            = ctRegex!(`[┘┙]`, "mg");
// the individual break characters
static br_line                                  = ctRegex!(`┘`, "mg");
static br_line_inline                           = ctRegex!(`┙`, "mg");
static br_line_spaced                           = ctRegex!(`┚`, "mg");
#+END_SRC

#+BEGIN_SRC d
// static brln                                           = ctRegex!(`(?:\\\\)+`, "mg");
#+END_SRC

*** quote marks

#+NAME: prgmkup_rgx_inline_quotes
#+BEGIN_SRC d
/+ quotation marks +/
// either curly double quotation mark: “ or ”
static quotes_open_and_close                    = ctRegex!(`[“”]`, "mg");
#+END_SRC

#+BEGIN_SRC d
// static quote_open                                    = ctRegex!(`[“]`, "mg");
// static quote_close                                   = ctRegex!(`[”]`, "mg");
#+END_SRC

*** inline markup font face mod :inline:font:face:

#+NAME: prgmkup_rgx_font_face
#+BEGIN_SRC d
/+ inline markup font face mod +/
// each pattern matches X{text}X for one marker character X; group "mark" is
// the opening marker and group "text" the shortest content between the braces
static inline_mark_emphasis                     = ctRegex!(`(?P<mark>[*])\{(?P<text>.+?)\}[*]`, "mg");
static inline_mark_bold                         = ctRegex!(`(?P<mark>[!])\{(?P<text>.+?)\}[!]`, "mg");
static inline_mark_underscore                   = ctRegex!(`(?P<mark>[_])\{(?P<text>.+?)\}[_]`, "mg");
static inline_mark_italics                      = ctRegex!(`(?P<mark>[/])\{(?P<text>.+?)\}[/]`, "mg");
static inline_mark_superscript                  = ctRegex!(`(?P<mark>\^)\{(?P<text>.+?)\}\^`, "mg");
static inline_mark_subscript                    = ctRegex!(`(?P<mark>[,])\{(?P<text>.+?)\}[,]`, "mg");
static inline_mark_strike                       = ctRegex!(`(?P<mark>[-])\{(?P<text>.+?)\}[-]`, "mg");
static inline_mark_insert                       = ctRegex!(`(?P<mark>[+])\{(?P<text>.+?)\}[+]`, "mg");
static inline_mark_mono                         = ctRegex!(`(?P<mark>[#])\{(?P<text>.+?)\}[#]`, "mg");
static inline_mark_cite                         = ctRegex!(`(?P<mark>["])\{(?P<text>.+?)\}["]`, "mg");
#+END_SRC

#+NAME: prgmkup_rgx_font_face_line
#+BEGIN_SRC d
// whole-line font face markers: "X_ " at the start of the input, where X is
// one of * ! / _ ; group "text" is the line content and the final group holds
// up to two trailing markers, each either a space + two backslashes or "~#"
// any of the four markers (the tail is an unnamed capture here)
static inline_faces_line                        = ctRegex!(`^[*!/_]_ (?P<text>.+?)((?: [\\]{2}|[~]#){0,2}$)`);
// one pattern per marker, with the tail as named group "tail"
static inline_emphasis_line                     = ctRegex!(`^\*_ (?P<text>.+?)(?P<tail>(?: [\\]{2}|[~]#){0,2}$)`);
static inline_bold_line                         = ctRegex!(`^!_ (?P<text>.+?)(?P<tail>(?: [\\]{2}|[~]#){0,2}$)`);
static inline_italics_line                      = ctRegex!(`^/_ (?P<text>.+?)(?P<tail>(?: [\\]{2}|[~]#){0,2}$)`);
static inline_underscore_line                   = ctRegex!(`^__ (?P<text>.+?)(?P<tail>(?: [\\]{2}|[~]#){0,2}$)`);
#+END_SRC

*** inline (internal program) markup footnotes endnotes :inline:footnote:

#+NAME: prgmkup_rgx_internal_footnotes_and_endnotes
#+BEGIN_SRC d
/+ inline markup footnotes endnotes +/
// internal note markup: notes are delimited by '【' and '】'
// capture 1: note text, after an optional '*' or '+' marker and whitespace
static inline_notes_al                          = ctRegex!(`【(?:[*+]\s+|\s*)(.+?)】`, "mg");
// only notes that start with a '*' or '+' marker
static inline_notes_al_special                  = ctRegex!(`【(?:[*+]\s+)(.+?)】`, "mg"); // TODO remove match when special footnotes are implemented
// any note including delimiters; the second form captures the content as "text"
static inline_notes_al_gen                      = ctRegex!(`【.+?】`, "m");
static inline_notes_al_gen_text                 = ctRegex!(`【(?P<text>.+?)】`, "m");
// note introduced by a number or by a run of '*' / '+' ("num"), then the text ("note")
static inline_notes_al_all_note                 = ctRegex!(`【(?P<num>\d+|(?:[*]|[+])+)\s+(?P<note>.+?)\s*】`, "mg");
// note introduced by a number only
static inline_notes_al_regular_number_note      = ctRegex!(`【(?P<num>\d+)\s+(?P<note>.+?)\s*】`, "mg");
// note introduced by a run of '*' / '+' only ("char")
static inline_notes_al_special_char_note        = ctRegex!(`【(?P<char>(?:[*]|[+])+)\s+(?P<note>.+?)】`, "mg");
// opening delimiter followed by whitespace, by "*" + whitespace, or by "+" + whitespace
static inline_al_delimiter_open_regular         = ctRegex!(`【\s`, "m");
static inline_al_delimiter_open_symbol_star     = ctRegex!(`【[*]\s`, "m");
static inline_al_delimiter_open_symbol_plus     = ctRegex!(`【[+]\s`, "m");
// a chunk of text ending at the close of the next note, or, if there is no
// further note, running to the end of the line
static inline_text_and_note_al_                 = ctRegex!(`(.+?(?:【[*+]*\s+.+?】|.+))`, "mg");
#+END_SRC

#+BEGIN_SRC d
// static inline_notes_al_regular                        = ctRegex!(`【(.+?)】`, "mg");
// static inline_notes_al_gen_ref                        = ctRegex!(`【(?P<ref>[*+]\s+)\s*(?P<text>.+?)】`, "mg");
#+END_SRC

*** inline links

#+NAME: prgmkup_rgx_inline_links
#+BEGIN_SRC d
/+ inline markup links +/
// Link notation as matched here: ┥text┝┤target├ (text between ┥ and ┝, target
// between ┤ and ├). An image is signalled by ☼ directly after the opening ┥.
// Image with dimensions: "img" is a file name ending in .jpg, .gif or .png,
// followed by ,w<width>h<height>; "post" runs through the closing ┝┤...├.
static inline_image                             = ctRegex!(`(?P<pre>┥)☼(?P<imginf>(?P<img>[a-zA-Z0-9._-]+?\.(?:jpg|gif|png)),w(?P<width>\d+)h(?P<height>\d+))\s*(?P<post>.*?┝┤.*?├)`, "mg");
// Same shape as inline_image, but only where width and height are both the literal 0.
static inline_image_without_dimensions          = ctRegex!(`(?P<pre>┥)☼(?P<imginf>(?P<img>[a-zA-Z0-9._-]+?\.(?:jpg|gif|png)),w(?P<width>0)h(?P<height>0))\s*(?P<post>.*?┝┤.*?├)`, "mg");
// Image name and dimensions on their own (optional leading ☼), without link delimiters.
static inline_image_info                        = ctRegex!(`☼?(?P<img>[a-zA-Z0-9._-]+?\.(?:jpg|gif|png)),w(?P<width>\d+)h(?P<height>\d+)`, "mg");
// Anchor: non-whitespace name enclosed in a pair of ┃.
static inline_link_anchor                       = ctRegex!(`┃(?P<anchor>\S+?)┃`, "mg"); // TODO *~text_link_anchor
// Generic link: non-whitespace target with an optional leading `#`.
static inline_link                              = ctRegex!(`┥(?P<text>.+?)┝┤(?P<link>#?(\S+?))├`, "mg");
// Link text with an empty target (┤├).
static inline_link_empty                        = ctRegex!(`┥(?P<text>.+?)┝┤├`, "mg");
// Link whose target consists of digits only.
static inline_link_number                       = ctRegex!(`┥(?P<text>.+?)┝┤(?P<num>[0-9]+)├`, "mg"); // not used
// As above, but the whole ┥...┝ span is captured as "linked_text".
static inline_link_number_only                  = ctRegex!(`(?P<linked_text>┥.+?┝)┤(?P<num>[0-9]+)├`, "mg");
// Target of at least two characters with no space or link delimiter, not starting with `#`.
// NOTE(review): both character classes exclude digits, so a target containing a
// digit at any position does not match - confirm this is intended.
static inline_link_stow_uri                     = ctRegex!(`┥(?P<text>.+?)┝┤(?P<link>[^ 0-9#┥┝┤├][^ 0-9┥┝┤├]+)├`, "mg"); // will not stow (stowed links) or object number internal links
// Target starting with `#`; the part after `#` is captured as "hash".
static inline_link_hash                         = ctRegex!(`┥(?P<text>.+?)┝┤(?P<link>#(?P<hash>\S+?))├`, "mg");
// Target of the form seg#hash; "seg" may be empty and contains no `/`, `#` or ├.
static inline_link_seg_and_hash                 = ctRegex!(`┥(?P<text>.+?)┝┤(?P<link>(?P<seg>[^/#├]*)#(?P<hash>.+?))├`, "mg");
// Matches a ┤...├ target part, or a single ┥ or ┝
// (presumably used to strip link markup and keep only the text - verify against callers).
static inline_link_clean                        = ctRegex!(`┤(?:.+?)├|[┥┝]`, "mg");
// Target part pointing at one of the named sections: endnotes, bibliography,
// bookindex, glossary or blurb.
static inline_link_toc_to_backmatter            = ctRegex!(`┤#(?P<link>endnotes|bibliography|bookindex|glossary|blurb)├`, "mg");
// http or https scheme prefix.
static url                                      = ctRegex!(`https?://`, "mg");
// http, https or git scheme prefix.
static uri                                      = ctRegex!(`(?:https?|git)://`, "mg");
// Splits a uri into "type" (scheme prefix), "path" (up to and including a `/`)
// and "file" (final segment without `/`, anchored at end of line).
static uri_identify_components                  = ctRegex!(`(?P<type>(?:https?|git)://)(?P<path>\S+?/)(?P<file>[^/]+)$`, "mg");
// Line starting with a level digit 5-7 and `~ `, followed by a link.
static inline_link_subtoc                       = ctRegex!(`^(?P<level>[5-7])~ ┥(?P<text>.+?)┝┤(?P<link>.+?)├`, "mg");
// ¤ marker, then captured text (group 1) and the literal `.fnSuffix` (group 2).
static inline_link_fn_suffix                    = ctRegex!(`¤(.+?)(\.fnSuffix)`, "mg");
// Captures only the ¤ marker where it is followed by text and `.fnSuffix`.
static inline_seg_link                          = ctRegex!(`(¤)(?:.+?)\.fnSuffix`, "mg");
// The ¤ marker by itself.
static mark_internal_site_lnk                   = ctRegex!(`¤`, "mg");
// A single-quote character.
static quotation_mark_sql_insert_delimiter      = ctRegex!("[']", "mg");
#+END_SRC

*** inline markup font face mod :inline:font:face:

#+NAME: prgmkup_rgx_inline_font_face
#+BEGIN_SRC d
/+ inline markup font face mod +/
// Font-face notation as matched here: <mark>┨text┣<mark>, with the same mark
// character before ┨ and after ┣. The enclosed text is matched lazily and
// captured as "text"; all patterns use the "mg" flags.
// mark `*`
static inline_emphasis                          = ctRegex!(`[*]┨(?P<text>.+?)┣[*]`, "mg");
// mark `!`
static inline_bold                              = ctRegex!(`[!]┨(?P<text>.+?)┣[!]`, "mg");
// mark `_`
static inline_underscore                        = ctRegex!(`[_]┨(?P<text>.+?)┣[_]`, "mg");
// mark `/`
static inline_italics                           = ctRegex!(`[/]┨(?P<text>.+?)┣[/]`, "mg");
// mark `^`
static inline_superscript                       = ctRegex!(`\^┨(?P<text>.+?)┣\^`, "mg");
// mark `,`
static inline_subscript                         = ctRegex!(`[,]┨(?P<text>.+?)┣[,]`, "mg");
// mark `-`
static inline_strike                            = ctRegex!(`[-]┨(?P<text>.+?)┣[-]`, "mg");
// mark `+`
static inline_insert                            = ctRegex!(`[+]┨(?P<text>.+?)┣[+]`, "mg");
// mark ■
static inline_mono                              = ctRegex!(`[■]┨(?P<text>.+?)┣[■]`, "mg");
// mark ‖
static inline_cite                              = ctRegex!(`[‖]┨(?P<text>.+?)┣[‖]`, "mg");
#+END_SRC

#+BEGIN_SRC d
// static inline_superscript                    = ctRegex!(`[\^]┨(?P<text>.+?)┣[\^]`, "mg");
// static inline_fontface_clean                 = ctRegex!(`[*!_/^,+■‖-]┨|┣[*!_/^,+■‖-]`, "mg");
#+END_SRC

*** table related

#+NAME: prgmkup_rgx_table
#+BEGIN_SRC d
/+ table delimiters +/
// Both patterns are written as double-quoted D strings (not backtick raw
// strings), so the \n below is a string-literal escape for a newline.
// Column separator: ┊ together with any spaces on either side.
static table_delimiter_col                      = ctRegex!("[ ]*[┊][ ]*", "mg");
// Row separator: a newline together with any spaces preceding it.
static table_delimiter_row                      = ctRegex!("[ ]*\n", "mg");
#+END_SRC

* document header including copyright & license

#+NAME: doc_header_including_copyright_and_license
#+BEGIN_SRC txt
/+
- Name: Spine, Doc Reform [a part of]
  - Description: documents, structuring, processing, publishing, search
    - static content generator

  - Author: Ralph Amissah
    [ralph.amissah@gmail.com]

  - Copyright: (C) 2015 - 2022 Ralph Amissah, All Rights
    Reserved.

  - License: AGPL 3 or later:

    Spine (SiSU), a framework for document structuring, publishing and
    search

    Copyright (C) Ralph Amissah

    This program is free software: you can redistribute it and/or modify it
    under the terms of the GNU Affero General Public License as published by the
    Free Software Foundation, either version 3 of the License, or (at your
    option) any later version.

    This program is distributed in the hope that it will be useful, but WITHOUT
    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
    FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
    for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program. If not, see [https://www.gnu.org/licenses/].

    If you have Internet connection, the latest version of the AGPL should be
    available at these locations:
    [https://www.fsf.org/licensing/licenses/agpl.html]
    [https://www.gnu.org/licenses/agpl.html]

  - Spine (by Doc Reform, related to SiSU) uses standard:
    - docReform markup syntax
      - standard SiSU markup syntax with modified headers and minor modifications
    - docReform object numbering
      - standard SiSU object citation numbering & system

  - Homepages:
    [https://www.doc_reform.org]
    [https://www.sisudoc.org]

  - Git
    [https://git.sisudoc.org/projects/?p=software/spine.git;a=summary]

+/
#+END_SRC

* __END__