/** * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several * chunks of tokens (one chunk per top-level block matched) and eventually an * end event. Tokens map to HTML tags as far as possible, with custom tokens * used where further processing on the token stream is needed. */ { var pegIncludes = options.pegIncludes; var pegTokenizer = options.pegTokenizer; var env = pegTokenizer.env; var pipelineOpts = pegTokenizer.options; var DU = pegIncludes.DOMUtils; var Util = pegIncludes.Util; var JSUtils = pegIncludes.JSUtils; var PegTokenizer = pegIncludes.PegTokenizer; var defines = pegIncludes.defines; var constants = pegIncludes.constants; var tu = pegIncludes.tu; // define some constructor shortcuts var KV = defines.KV; var TagTk = defines.TagTk; var SelfclosingTagTk = defines.SelfclosingTagTk; var EndTagTk = defines.EndTagTk; var NlTk = defines.NlTk; var CommentTk = defines.CommentTk; var EOFTk = defines.EOFTk; var lastItem = JSUtils.lastItem; var inlineBreaks = tu.inlineBreaks; var stops = new tu.SyntaxStops(); var prevOffset = 0; // Some shorthands for legibility var startOffset = function() { return location().start.offset; }; var endOffset = function() { return location().end.offset; }; var tsrOffsets = function(flag) { return tu.tsrOffsets(location(), flag); }; /* * Emit a chunk of tokens to our consumers. Once this has been done, the * current expression can return an empty list (true). */ var emitChunk = function(tokens) { // Shift tsr of all tokens by the pipeline offset Util.shiftTokenTSR(tokens, options.pipelineOffset); env.log("trace/peg", pegTokenizer.pipelineId, "----> ", tokens); var i; var n = tokens.length; // Enforce parsing resource limits for (i = 0; i < n; i++) { tu.enforceParserResourceLimits(env, tokens[i]); } // limit the size of individual chunks var chunkLimit = 100000; if (n > chunkLimit) { i = 0; while (i < n) { options.cb(tokens.slice(i, i + chunkLimit)); i += chunkLimit; } } else { options.cb(tokens); } }; /* ------------------------------------------------------------------------ * Extension tags should be parsed with higher priority than anything else. * * The trick we use is to strip out the content inside a matching tag-pair * and not tokenize it. The content, if it needs to be parsed (for example, * for <ref>, <*include*> tags), is parsed in a fresh tokenizer context * which means any error correction that needs to happen is restricted to * the scope of the extension content and doesn't spill over to the higher * level. Ex: <math><!--foo</math>. * ------------------------------------------------------------------------ */ // The php parser does a straight str.replace(/<!--((?!-->).)*-->/g, "") // but, as always, things around here are a little more complicated. // // We accept the same comments, but because we emit them as HTML comments // instead of deleting them, we have to encode the data to ensure that // we always emit a valid HTML5 comment. See the encodeComment helper // for further details. comment = '<!--' c:$(!"-->" .)* ('-->' / eof) { var data = DU.encodeComment(c); return [new CommentTk(data, { tsr: tsrOffsets() })]; } // Behavior switches. See: // https://www.mediawiki.org/wiki/Help:Magic_words#Behavior_switches behavior_switch = bs:$('__' behavior_text '__') { if (env.conf.wiki.isMagicWord(bs)) { return [ new SelfclosingTagTk('behavior-switch', [ new KV('word', bs) ], { tsr: tsrOffsets(), src: bs, magicSrc: bs } ), ]; } else { return [ bs ]; } } // Instead of defining a charset, php's doDoubleUnderscore concats a regexp of // all the language specific aliases of the behavior switches and then does a // match and replace.
Just be as permissive as possible and let the // BehaviorSwitchPreprocessor back out of any overreach. behavior_text = $( !'__' [^'"<~[{\n\r:;\]}|!=] )+ /************************************************************** * External (bracketed and autolinked) links **************************************************************/ autolink = ! { return stops.onStack('extlink'); } // this must be a word boundary, so previous character must be non-word ! { return /\w/.test(input[endOffset() - 1] || ''); } r:( // urllink, inlined target:autourl { var res = [new SelfclosingTagTk('urllink', [new KV('href', target)], { tsr: tsrOffsets() })]; return res; } / autoref / isbn) { return r; } extlink "extlink" = ! { return stops.onStack('extlink'); } // extlink cannot be nested r:( "[" & { return stops.push('extlink', true); } addr:(url_protocol urladdr / "") target:(extlink_preprocessor_text / "") & { // Protocol must be valid and there ought to be at least one // post-protocol character. So strip last char off target // before testing protocol. var flat = tu.flattenString([addr, target]); if (Array.isArray(flat)) { // There are templates present, alas. return flat.length > 0; } return Util.isProtocolValid(flat.slice(0, -1), env); } sp:$( space / unispace )* targetOff:( "" { return endOffset(); }) content:inlineline? "]" { stops.pop('extlink'); return [ new SelfclosingTagTk('extlink', [ new KV('href', tu.flattenString([addr, target])), new KV('mw:content', content || ''), new KV('spaces', sp), ], { targetOff: targetOff, tsr: tsrOffsets(), contentOffsets: [targetOff, endOffset() - 1], }), ]; } / "[" & { return stops.pop('extlink'); } ) { return r; } autoref = ref:('RFC' / 'PMID') sp:space_or_nbsp+ identifier:$[0-9]+ end_of_word { var base_urls = { 'RFC': '//tools.ietf.org/html/rfc%s', 'PMID': '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract', }; var url = tu.sprintf(base_urls[ref], identifier); return [ new SelfclosingTagTk('extlink', [ new KV('href', tu.sprintf(base_urls[ref], identifier)), new KV('mw:content', tu.flattenString([ref, sp, identifier])), new KV('typeof', 'mw:ExtLink/' + ref), ], { stx: "magiclink", tsr: tsrOffsets() }), ]; } isbn = 'ISBN' sp:space_or_nbsp+ isbn:( [0-9] (s:space_or_nbsp_or_dash &[0-9] { return s; } / [0-9])+ ((space_or_nbsp_or_dash / "") [xX] / "") ) isbncode:( end_of_word { // Convert isbn token-and-entity array to stripped string. return tu.flattenStringlist(isbn).filter(function(e) { return e.constructor === String; }).join('').replace(/[^\dX]/ig, '').toUpperCase(); } ) &{ // ISBNs can only be 10 or 13 digits long (with a specific format) return isbncode.length === 10 || (isbncode.length === 13 && /^97[89]/.test(isbncode)); } { return [ new SelfclosingTagTk('extlink', [ new KV('href', 'Special:BookSources/' + isbncode), new KV('mw:content', tu.flattenString(['ISBN', sp, isbn])), new KV('typeof', 'mw:WikiLink/ISBN'), ], { stx: "magiclink", tsr: tsrOffsets() }), ]; } /* Default URL protocols in MediaWiki (see DefaultSettings). Normally * these can be configured dynamically. */ url_protocol = & { return Util.isProtocolValid(input.substr(endOffset()), env); } p:$( '//' / [A-Za-z] [-A-Za-z0-9+.]* ':' '//'? 
) { return p; } // no punctuation, and '{<' to trigger directives no_punctuation_char = [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{] // this is the general url rule // on the PHP side, the path part matches EXT_LINK_URL_CLASS // which is '[^][<>"\\x00-\\x20\\x7F\p{Zs}]' // the 's' and 'r' pieces below match the characters in // EXT_LINK_URL_CLASS which aren't included in no_punctuation_char url "url" = proto:url_protocol addr:(urladdr / "") path:( ( !inline_breaks c:no_punctuation_char { return c; } ) / s:[.:,'] { return s; } / comment / tplarg_or_template / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" ) r:( & "&" he:htmlentity { return he; } / [&%{] ) { return r; } )* // Must be at least one character after the protocol & { return addr.length > 0 || path.length > 0; } { return tu.flattenString([proto, addr].concat(path)); } // this is the somewhat-restricted rule used in autolinks // See Parser::doMagicLinks and Parser.php::makeFreeExternalLink. // The `path` portion matches EXT_LINK_URL_CLASS, as in the general // url rule. As in PHP, we do some fancy fixup to yank out // trailing punctuation, perhaps including parentheses. // The 's' and 'r' pieces match the characters in EXT_LINK_URL_CLASS // which aren't included in no_punctuation_char autourl = &{ return stops.push('autourl', { sawLParen: false }); } ! '//' // protocol-relative autolinks not allowed (T32269) r:( proto:url_protocol addr:(urladdr / "") path:( ( !inline_breaks ! "(" c:no_punctuation_char { return c; } ) / "(" { stops.onStack('autourl').sawLParen = true; return "("; } / [.:,] / $(['] ![']) // single quotes are ok, double quotes are bad / comment / tplarg_or_template / ! ( rhe:raw_htmlentity &{ return /^[<>\u00A0]$/.test(rhe); } ) r:( & "&" he:htmlentity { return he; } / [&%{] ) { return r; } )* { // as in Parser.php::makeFreeExternalLink, we're going to // yank trailing punctuation out of this match. var url = tu.flattenStringlist([proto, addr].concat(path)); // only need to look at last element; HTML entities are strip-proof. var last = lastItem(url); var trim = 0; if (last && last.constructor === String) { var strip = ',;\\.:!?'; if (!stops.onStack('autourl').sawLParen) { strip += ')'; } strip = new RegExp('[' + Util.escapeRegExp(strip) + ']*$'); trim = strip.exec(last)[0].length; url[url.length - 1] = last.slice(0, last.length - trim); } url = tu.flattenStringlist(url); if (url.length === 1 && url[0].constructor === String && url[0].length <= proto.length) { return null; // ensure we haven't stripped everything: T106945 } peg$currPos -= trim; stops.pop('autourl'); return url; } ) &{ return r !== null; } {return r; } / &{ return stops.pop('autourl'); } // This is extracted from EXT_LINK_ADDR in Parser.php: a simplified // expression to match an IPv6 address. The IPv4 address and "at least // one character of a host name" portions are punted to the `path` // component of the `autourl` and `url` productions urladdr = $( "[" [0-9A-Fa-f:.]+ "]" ) /************************************************************** * Templates, -arguments and wikilinks **************************************************************/ /* * Precedence: template arguments win over templates. See * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence * 4: {{{{·}}}} → {·{{{·}}}·} * 5: {{{{{·}}}}} → {{·{{{·}}}·}} * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}} * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·} * This is only if close has > 3 braces; otherwise we just match open * and close as we find them. 
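 * E.g. "{{{{1}}}}" (four braces) tokenizes as a literal "{", the template argument {{{1}}}, and a literal "}": the innermost three braces bind first, per the 4-brace row above.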
*/ tplarg_or_template = &'{{' &{ // Refuse to recurse beyond 40 levels. Default in the PHP parser // is $wgMaxTemplateDepth = 40; This is to prevent crashing from // buggy wikitext with lots of unclosed template calls, as in // eswiki/Usuario:C%C3%A1rdenas/PRUEBAS?oldid=651094 if (stops.onCount('templatedepth') === undefined || stops.onCount('templatedepth') < 40) { return true; } else { return false; } } t:tplarg_or_template_guarded { return t; } tplarg_or_template_guarded = &{ return stops.inc('templatedepth'); } r:( &('{{' &('{{{'+ !'{') tplarg) a:(template/broken_template) { return a; } / a:$('{' &('{{{'+ !'{'))? b:tplarg { return [a].concat(b); } / a:$('{' &('{{' !'{'))? b:template { return [a].concat(b); } / a:broken_template { return a; } ) { stops.dec('templatedepth'); return r; } / & { return stops.dec('templatedepth'); } tplarg_or_template_or_bust "tplarg_or_template_or_bust" = r:(tplarg_or_template / .)+ { return tu.flattenIfArray(r); } template = stopLen:("" { return stops.push('preproc', /* {{ */'}}'); }) t:( template_preproc / &{ return stops.popTo('preproc', stopLen); } ) { stops.popTo('preproc', stopLen); return t; } // The PHP preprocessor maintains a single stack of "closing token we // are currently looking for", with no backtracking. This means that // once you see `[[ {{` you are looking only for `}}` -- if that template // turns out to be broken you will never pop the `}}` and there is no way // to close the `[[`. Since the PEG tokenizer in Parsoid uses backtracking // and parses in a single pass (instead of PHP's split preprocessor/parser) // we have to be a little more careful when we emulate this behavior. // If we use a rule like: // template = "{{" tplname tplargs* "}}"? // Then we end up having to reinterpret `tplname tplargs*` as a tlb if it // turns out we never find the `}}`, which involves a lot of tedious gluing // tokens back together with fingers crossed we haven't discarded any // significant newlines/whitespace/etc. An alternative would be a rule like: // broken_template = "{{" tlb // but again, `template` is used in many different contexts; `tlb` isn't // necessarily the right one to recursively invoke. Instead we get the // broken template off of the PEGjs production stack by returning immediately // after `{{`, but we leave a "broken token" on top of the preprocessor // stops stack to indicate we're "still in" the {{ context and shouldn't // ever inlineBreak for any closing tokens above this one. For example: // [[Foo{{Bar]] // This will match as: // wikilink->text,template->text --> FAILS looking for }} // backtracks, popping "]]" and "}}" off preproc stack // wikilink->text,broken_template,text --> FAILS looking for ]] // backtracks, popping "]]" and "broken" off preproc stack // broken_wikilink,text,broken_template,text --> OK // with ["broken", "broken"] left on the preproc stops stack // Note that we use stops.popTo() to make sure the preproc stack is // cleaned up properly during backtracking, even if there were broken-FOO // productions taken which (deliberately) left elements on the preproc stack. 
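// (A sketch of the SyntaxStops contract the stopLen/popTo pairs below rely on, assuming the implementation in tokenizer.utils.js: push() returns the new depth of the named stack, and popTo(name, len) truncates that stack back to len entries, discarding anything a backtracked or deliberately-broken production left behind.)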
broken_template = &"{{" &{ return stops.push('preproc', 'broken'); } // for broken-template, deliberately fail to pop the preproc stops stack t:"{{" { return t; } template_preproc = "{{" nl_comment_space* target:template_param_value params:(nl_comment_space* "|" r:( p0:("" { return endOffset(); }) v:nl_comment_space* p:("" { return endOffset(); }) &("|" / "}}") { return new KV('', tu.flattenIfArray(v), [p0, p0, p0, p]); } // empty argument / template_param ) { return r; } )* nl_comment_space* inline_breaks "}}" { // Insert target as first positional attribute, so that it can be // generically expanded. The TemplateHandler then needs to shift it out // again. params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets)); var obj = new SelfclosingTagTk('template', params, { tsr: tsrOffsets(), src: text() }); return obj; } / $('{{' space_or_newline+ '}}') tplarg = stopLen:("" { return stops.push('preproc', /* {{ */'}}'); }) t:(tplarg_preproc / &{ return stops.popTo('preproc', stopLen); } ) { stops.popTo('preproc', stopLen); return t; } tplarg_preproc = "{{{" p:("" { return endOffset(); }) target:template_param_value? params:(nl_comment_space* "|" r:( p0:("" { return endOffset(); }) v:nl_comment_space* p1:("" { return endOffset(); }) &("|" / "}}}") { return { tokens: v, srcOffsets: [p0, p1] }; } // empty argument / template_param_value ) { return r; } )* nl_comment_space* inline_breaks "}}}" { params = params.map(function(o) { var s = o.srcOffsets; return new KV('', tu.flattenIfArray(o.tokens), [s[0], s[0], s[0], s[1]]); }); if (target === null) { target = { tokens: '', srcOffsets: [p, p, p, p] }; } // Insert target as first positional attribute, so that it can be // generically expanded. The TemplateHandler then needs to shift it out // again. params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets)); var obj = new SelfclosingTagTk('templatearg', params, { tsr: tsrOffsets(), src: text() }); return obj; } template_param = name:template_param_name val:( kEndPos:("" { return endOffset(); }) optionalSpaceToken "=" vStartPos:("" { return endOffset(); }) optionalSpaceToken tpv:template_param_value? { return { kEndPos: kEndPos, vStartPos: vStartPos, value: (tpv && tpv.tokens) || [] }; } )? 
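// (A param KV's srcOffsets quadruple is [keyStart, keyEnd, valueStart, valueEnd]; for positional parameters the key span collapses to a zero-width point, as the branches below illustrate.)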
{ if (val !== null) { if (val.value !== null) { return new KV(name, tu.flattenIfArray(val.value), [startOffset(), val.kEndPos, val.vStartPos, endOffset()]); } else { return new KV(tu.flattenIfArray(name), '', [startOffset(), val.kEndPos, val.vStartPos, endOffset()]); } } else { return new KV('', tu.flattenIfArray(name), [startOffset(), startOffset(), startOffset(), endOffset()]); } } // empty parameter / & [|}] { return new KV('', '', [startOffset(), startOffset(), startOffset(), endOffset()]); } template_param_name = & { return stops.push('equal', true); } tpt:(template_param_text / &'=' { return ''; }) { stops.pop('equal'); return tpt; } / & { return stops.pop('equal'); } template_param_value = & { stops.inc('nopre'); return stops.push('equal', false); } tpt:template_param_text { stops.dec('nopre'); stops.pop('equal'); return { tokens: tpt, srcOffsets: tsrOffsets() }; } / & { stops.dec('nopre'); return stops.pop('equal'); } template_param_text = & { // re-enable tables within template parameters stops.push('table', false); stops.push('extlink', false); stops.push('templateArg', true); stops.push('tableCellArg', false); return stops.inc('template'); } il:(nested_block / newlineToken)+ { stops.pop('table'); stops.pop('extlink'); stops.pop('templateArg'); stops.pop('tableCellArg'); stops.dec('template'); // il is guaranteed to be an array -- so, tu.flattenIfArray will // always return an array var r = tu.flattenIfArray(il); if (r.length === 1 && r[0].constructor === String) { r = r[0]; } return r; } / & { stops.pop('table'); stops.pop('extlink'); stops.pop('templateArg'); stops.pop('tableCellArg'); return stops.dec('template'); } //// Language converter block markup of language variants: -{ ... }- // Note that "rightmost opening" precedence rule (see // https://www.mediawiki.org/wiki/Preprocessor_ABNF ) means // that neither -{{ nor -{{{ are parsed as a -{ token, although // -{{{{ is (since {{{ has precedence over {{). lang_variant_or_tpl = &('-{' &('{{{'+ !'{') tplarg) a:lang_variant { return a; } / a:$('-' &('{{{'+ !'{')) b:tplarg { return [a].concat(b); } / a:$('-' &('{{' '{{{'* !'{')) b:template { return [a].concat(b); } / &'-{' a:lang_variant { return a; } broken_lang_variant = &{ return stops.push('preproc', 'broken'); } // for broken-lang-variant, deliberately fail to pop the stops stack r:"-{" { return r; } lang_variant = stopLen:("" { return stops.push('preproc', /* -{ */ '}-'); }) lv:(lang_variant_preproc / &{ return stops.popTo('preproc', stopLen); }) { stops.popTo('preproc', stopLen); return lv; } / broken_lang_variant lang_variant_preproc = lv0:("-{" { return startOffset(); }) f:( &{ return env.langConverterEnabled(); } ff:opt_lang_variant_flags { // if flags contains 'R', then don't treat ; or : specially inside. if (ff.flags) { ff.raw = ff.flags.has('R') || ff.flags.has('N'); } else if (ff.variants) { ff.raw = true; } return ff; } / &{ return !env.langConverterEnabled(); } "" { // if language converter not enabled, don't try to parse inside. 
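// (We still consume the -{ ... }- syntax so source offsets stay correct, but with raw set the action below re-emits the body verbatim instead of building a language-variant token.)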
return { raw: true }; } ) ts:( &{ return f.raw; } lv:lang_variant_text { return [{ text: lv }]; } / &{ return !f.raw; } lv:lang_variant_option_list { return lv; } ) inline_breaks lv1:("}-" { return endOffset(); }) { if (!env.langConverterEnabled()) { return [ "-{", ts[0].text.tokens, "}-" ]; } var lvsrc = input.substring(lv0, lv1); var attribs = []; ts.forEach(function(t) { // move token strings into KV attributes so that they are // properly expanded by early stages of the token pipeline ['text','from','to'].forEach(function(fld) { if (t[fld] === undefined) { return; } var name = 'mw:lv' + attribs.length; attribs.push(new KV(name, t[fld].tokens, t[fld].srcOffsets)); t[fld] = name; }); }); return [ new SelfclosingTagTk( 'language-variant', attribs, { tsr: [lv0, lv1], src: lvsrc, flags: f.flags && Array.from(f.flags).sort(), variants: f.variants && Array.from(f.variants).sort(), original: f.original, flagSp: f.sp, texts: ts, }), ]; } opt_lang_variant_flags = f:( ff:lang_variant_flags "|" { return ff; } )? { // Collect & separate flags and variants into a set and ordered list var flags = new Set(); var variants = new Set(); var flagList = []; var flagSpace = []; var variantList = []; var variantSpace = []; var useVariants = false; var internalSp = []; // internal whitespace, for round-tripping if (f !== null) { // lang_variant_flags returns arrays in reverse order. f.flags.reverse(); f.sp.reverse(); var spPtr = 0; f.flags.forEach(function(item) { if (item.flag) { flagSpace.push(f.sp[spPtr++]); flags.add(item.flag); flagList.push(item.flag); flagSpace.push(f.sp[spPtr++]); } if (item.variant) { variantSpace.push(f.sp[spPtr++]); variants.add(item.variant); variantList.push(item.variant); variantSpace.push(f.sp[spPtr++]); } }); if (spPtr < f.sp.length) { // handle space after a trailing semicolon flagSpace.push(f.sp[spPtr]); variantSpace.push(f.sp[spPtr]); } } // Parse flags (this logic is from core/languages/ConverterRule.php // in the parseFlags() function) if (flags.size === 0 && variants.size === 0) { flags.add('$S'); } else if (flags.has('R')) { flags = new Set(['R']); // remove other flags } else if (flags.has('N')) { flags = new Set(['N']); // remove other flags } else if (flags.has('-')) { flags = new Set(['-']); // remove other flags } else if (flags.has('T') && flags.size === 1) { flags.add('H'); } else if (flags.has('H')) { // Replace A flag, and remove other flags except T and D var nf = new Set(['$+', 'H']); if (flags.has('T')) { nf.add('T'); } if (flags.has('D')) { nf.add('D'); } flags = nf; } else if (variants.size > 0) { useVariants = true; } else { if (flags.has('A')) { flags.add('$+'); flags.add('$S'); } if (flags.has('D')) { flags.delete('$S'); } } if (useVariants) { return { variants: variants, original: variantList, sp: variantSpace }; } else { return { flags: flags, original: flagList, sp: flagSpace }; } } lang_variant_flags = sp1:(space_or_newline*) f:lang_variant_flag sp2:(space_or_newline*) more:( ";" lang_variant_flags? )? { var r = more && more[1] ? more[1] : { sp: [], flags: [] }; // Note that sp and flags are in reverse order, since we're using // right recursion and want to push instead of unshift. 
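// E.g. parsing "A;B" builds flags [{flag:'B'}, {flag:'A'}] plus four empty sp entries bottom-up; opt_lang_variant_flags reverses both arrays before walking them.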
r.sp.push(sp2.join('')); r.sp.push(sp1.join('')); r.flags.push(f); return r; } / sp:(space_or_newline*) { return { sp: [ sp.join('') ], flags: [] }; } lang_variant_flag = f:[-+A-Z] { return { flag: f }; } / v:lang_variant_name { return { variant: v }; } / b:(!space_or_newline [^{}|;])+ { return { bogus: b.join('') }; /* bad flag */} lang_variant_name // language variant name, like zh, zh-cn, etc. = h:[a-z] t:[-a-z]+ { return h + t.join(''); } // Escaped otherwise-unrepresentable language names // Primarily for supporting html2html round trips; PHP doesn't support // using nowikis here (yet!) / nowiki_text lang_variant_option_list = o:lang_variant_option rest:( ";" oo:lang_variant_option { return oo; })* tr:( ";" space_or_newline* )? // optional trailing semicolon { var r = [ o ].concat(rest); if (tr) { r.push({ semi: true, sp: tr[1].join('') }); } return r; } / lvtext:lang_variant_text { return [{ text: lvtext }]; } lang_variant_option = sp1:(space_or_newline*) lang:lang_variant_name sp2:(space_or_newline*) ":" sp3:(space_or_newline*) lvtext:(lang_variant_nowiki / lang_variant_text_no_semi) { return { twoway: true, lang: lang, text: lvtext, sp: [sp1.join(''), sp2.join(''), sp3.join('')] }; } / sp1:(space_or_newline*) from:(lang_variant_nowiki / lang_variant_text_no_semi_or_arrow) "=>" sp2:(space_or_newline*) lang:lang_variant_name sp3:(space_or_newline*) ":" sp4:(space_or_newline*) to:(lang_variant_nowiki / lang_variant_text_no_semi) { return { oneway: true, from: from, lang: lang, to: to, sp: [sp1.join(''), sp2.join(''), sp3.join(''), sp4.join('')] }; } // html2wt support: If a language name or conversion string can't be // represented w/o breaking wikitext, just wrap it in a <nowiki>. // PHP doesn't support this (yet), but Parsoid does. lang_variant_nowiki = start:("" {return startOffset();}) n:nowiki_text end:("" { return endOffset();}) space_or_newline* { return { tokens: [ n ], srcOffsets: [start, end] }; } lang_variant_text = start:("" {return startOffset();}) tokens:(inlineline / "|" )* end:("" {return endOffset();}) { return { tokens: tokens || [], srcOffsets: [start, end] }; } lang_variant_text_no_semi = & { return stops.push('semicolon', true); } lvtext:lang_variant_text { stops.pop('semicolon'); return lvtext; } / & { return stops.pop('semicolon'); } lang_variant_text_no_semi_or_arrow = & { return stops.push('arrow', true); } lvtext:lang_variant_text_no_semi { stops.pop('arrow'); return lvtext; } / & { return stops.pop('arrow'); } wikilink_content = ( pipe startPos:("" { return endOffset(); }) lt:link_text? { var maybeContent = new KV('mw:maybeContent', lt, [startPos, endOffset()]); maybeContent.vsrc = input.substring(startPos, endOffset()); return maybeContent; } )* wikilink = stopLen:("" { return stops.push('preproc', ']]'); }) w:(wikilink_preproc / &{ return stops.popTo('preproc', stopLen); }) { stops.popTo('preproc', stopLen); return w; } / broken_wikilink // `broken-link` (see [[:mw:Preprocessor_ABNF]]), but careful because the // second bracket could start an extlink. Deliberately leave entry // on preproc stack since we haven't seen a double-close bracket. // (See full explanation above broken_template production.) broken_wikilink = &"[[" &{ return stops.push('preproc', 'broken'); } a:("[" (extlink / "[")) { return a; } wikilink_preproc = "[[" target:wikilink_preprocessor_text?
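// tpos marks the end of the link target so that the href's vsrc can be sliced straight out of the input in the action below.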
tpos:("" { return endOffset(); }) lcs:wikilink_content inline_breaks "]]" { var pipeTrick = (lcs.length === 1 && lcs[0].v === null); var textTokens = []; if (target === null || pipeTrick) { textTokens.push("[["); if (target) { textTokens.push(target); } lcs.forEach(function(a) { // a is a mw:maybeContent attribute textTokens.push("|"); if (a.v !== null) { textTokens.push(a.v); } }); textTokens.push("]]"); return textTokens; } var obj = new SelfclosingTagTk('wikilink'); var hrefKV = new KV('href', target); hrefKV.vsrc = input.substring(startOffset() + 2, tpos); // XXX: Point to object with path, revision and input information // obj.source = input; obj.attribs.push(hrefKV); obj.attribs = obj.attribs.concat(lcs); obj.dataAttribs = { tsr: tsrOffsets(), src: text(), }; return [obj]; } // Tables are allowed inside image captions. link_text = & { // Suppress the flag temporarily in this rule to consume the '=' here. stops.push('equal', false); return stops.push('linkdesc', true); } c:((sol full_table_in_link_caption) / urltext / (!inline_breaks r:( inline_element / '[' text_char+ ']' $(&(!']' / ']]')) / . ) { return r; } ) )+ { stops.pop('equal'); stops.pop('linkdesc'); return tu.flattenStringlist(c); } / & { stops.pop('equal'); return stops.pop('linkdesc'); } /* Generic quote rule for italic and bold, further processed in a token * stream transformation in doQuotes. Relies on NlTk tokens being emitted * for each line of text to balance quotes per line. * * We are not using a simple pair rule here as we need to support mis-nested * bolds/italics and MediaWiki's special heuristics for apostrophes, which are * all not context free. */ quote = quotes:$("''" "'"*) { // sequences of four or more than five quotes are assumed to start // with some number of plain-text apostrophes. var plainticks = 0; var result = []; if (quotes.length === 4) { plainticks = 1; } else if (quotes.length > 5) { plainticks = quotes.length - 5; } if (plainticks > 0) { result.push(quotes.substring(0, plainticks)); } // mw-quote token Will be consumed in token transforms var tsr = tsrOffsets(); tsr[0] += plainticks; var mwq = new SelfclosingTagTk('mw-quote', [], { tsr: tsr }); mwq.value = quotes.substring(plainticks); result.push(mwq); return result; } /*********************************************************** * Pre and xmlish tags ***********************************************************/ // Indented pre blocks differ from their non-indented (purely tag-based) // cousins by having their contents parsed. pre_indent = l:pre_indent_line // keep consuming indented lines unless they start a table ls:(s:sol !(space* "{|") pl:pre_indent_line { return s.concat(pl); } )* { return l.concat(ls); } // Don't recognize tabs pre_indent_line = " " l:nested_block_line { return [' '].concat(l); } // This is only used in directive, but maybe that should be accepting extension // tags in general? nowiki = &("<" "/"? "nowiki"i !tag_name_chars) x:xmlish_tag { return x; } // Used by nowiki extension to tokenize html entities. nowiki_content "nowiki_content" = c:(htmlentity / .)* { return tu.flattenIfArray(c); } // Used by lang_variant productions to protect special language names or // conversion strings. nowiki_text = extToken:nowiki { var txt = Util.getArgInfo(extToken).dict.body.extsrc; return Util.decodeEntities(txt); } /* Generic XML-like tags * * These also cover extensions (including Cite), which will hook into the * token stream for further processing. 
The content of extension tags is * parsed as regular inline, but the source positions of the tag are added * to allow reconstructing the unparsed text from the input. */ // See http://www.w3.org/TR/html5/syntax.html#tag-open-state and // following paragraphs. tag_name_chars = [^\t\n\v />\0] tag_name = $([A-Za-z] tag_name_chars*) xmlish_tag = & { // By the time we get to `doTableStuff` in the php parser, we've already // safely encoded element attributes. See 55313f4e in core. stops.push('table', false); stops.push('tableCellArg', false); return true; } "<" end:"/"? name:$(tn:tag_name & { return isXMLTag(tn, false); // NOTE: 'extTag' stop was pushed. }) attribs:generic_newline_attributes space_or_newline* // No need to preserve this -- canonicalize on RT via dirty diff selfclose:"/"? bad_ws:space* // No need to preserve this -- canonicalize on RT via dirty diff ">" { stops.pop('table'); stops.pop('tableCellArg'); stops.pop('extTag'); var lcName = name.toLowerCase(); // Extension tags don't necessarily have the same semantics as html tags, // so don't treat them as void elements. var isVoidElt = Util.isVoidElement(lcName) && !env.conf.wiki.extensionTags.has(lcName); // Support </br>
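// A stray closing </br> is treated as an intentionally-broken <br>: drop the end-tag bit here and let the brokenHTMLTag flag below record that the source html wasn't clean.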
var broken = false; if (lcName === 'br' && end) { broken = true; end = null; } var res = tu.buildXMLTag(name, lcName, attribs, end, !!selfclose || isVoidElt, tsrOffsets()); // change up data-attribs in one scenario // void-elts that aren't self-closed ==> useful for accurate RT-ing if (!selfclose && isVoidElt) { res.dataAttribs.selfClose = undefined; res.dataAttribs.noClose = true; } if (broken || bad_ws.length > 0) { res.dataAttribs.brokenHTMLTag = true; } return maybeExtensionTag(res); } / "<" "/"? tag_name & { return stops.pop('extTag'); } / & { stops.pop('table'); return stops.pop('tableCellArg'); } /* * A variant of xmlish_tag, but also checks if the tag name is a block-level * tag as defined in * http://www.w3.org/TR/html5/syntax.html#tag-open-state and * following paragraphs. */ block_tag = & { // By the time we get to `doTableStuff` in the php parser, we've already // safely encoded element attributes. See 55313f4e in core. stops.push('table', false); stops.push('tableCellArg', false); return true; } "<" end:"/"? name:$(tn:tag_name & { return isXMLTag(tn, true); // NOTE: 'extTag' stop was pushed. }) attribs:generic_newline_attributes space_or_newline* selfclose:"/"? ">" { stops.pop('table'); stops.pop('tableCellArg'); stops.pop('extTag'); var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, !!selfclose, tsrOffsets()); return [maybeExtensionTag(t)]; } / "<" "/"? tag_name & { return stops.pop('extTag'); } / & { stops.pop('table'); return stops.pop('tableCellArg'); } // A generic attribute that can span multiple lines. generic_newline_attribute = s:space_or_newline* namePos0:("" { return endOffset(); }) name:generic_attribute_name namePos:("" { return endOffset(); }) vd:(space_or_newline* "=" v:generic_att_value? { return v; })? { // NB: Keep in sync w/ table_attribute var res; // Encapsulate protected attributes. if (typeof name === 'string') { name = tu.protectAttrs(name); } if (vd !== null) { res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); } else { res = new KV(name, '', [namePos0, namePos, namePos, namePos]); } if (Array.isArray(name)) { res.ksrc = input.substring(namePos0, namePos); } return res; } // A single-line attribute. table_attribute = s:optionalSpaceToken namePos0:("" { return endOffset(); }) name:table_attribute_name namePos:("" { return endOffset(); }) vd:(optionalSpaceToken "=" v:table_att_value? { return v; })? { // NB: Keep in sync w/ generic_newline_attribute var res; // Encapsulate protected attributes. if (typeof name === 'string') { name = tu.protectAttrs(name); } if (vd !== null) { res = new KV(name, vd.value, [namePos0, namePos, vd.srcOffsets[0], vd.srcOffsets[1]]); res.vsrc = input.substring(vd.srcOffsets[0], vd.srcOffsets[1]); } else { res = new KV(name, '', [namePos0, namePos, namePos, namePos]); } if (Array.isArray(name)) { res.ksrc = input.substring(namePos0, namePos); } return res; } // The arrangement of chars is to emphasize the split between what's disallowed // by html5 and what's necessary to give directive a chance. // See: http://www.w3.org/TR/html5/syntax.html#attributes-0 generic_attribute_name = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive r:( $[^ \t\r\n\0/=><&{}\-!|]+ / !inline_breaks // \0/=> is the html5 attribute name set we do not want. t:( directive / !( space_or_newline / [\0/=>] ) c:.
{ return c; } ) { return t; } )* & { return r.length > 0 || q.length > 0; } { return tu.flattenString([q].concat(r)); } // Also accept these chars in a wikitext table or tr attribute name position. // They are normally not matched by the table_attribute_name. broken_table_attribute_name_char = c:[\0/=>] { return new KV(c, ''); } // Same as generic_attribute_name, except for accepting tags and wikilinks. // (That doesn't make sense (ie. match php) in the generic case.) // We also give a chance to break on \[ (see T2553). table_attribute_name = q:$(["'=]?) // From #before-attribute-name-state, < is omitted for directive r:( $[^ \t\r\n\0/=><&{}\-!|\[]+ / !inline_breaks // \0/=> is the html5 attribute name set we do not want. t:( $wikilink / directive // Accept insane tags-inside-attributes as attribute names. // The sanitizer will strip and shadow them for roundtripping. // Example: <hiddentext>generated with.. </hiddentext> / &xmlish_tag nb:nested_block_line // `nested_block_line` can return zero or more blocks. // Assure that we've got at least one, otherwise that plus // below is trouble. &{ return nb.length > 0; } { return nb; } / !( space_or_newline / [\0/=>] ) c:. { return c; } ) { return t; } )* & { return r.length > 0 || q.length > 0; } { return tu.flattenString([q].concat(r)); } // Attribute value, quoted variants can span multiple lines. // Missing end quote: accept /> look-ahead as heuristic. // These need to be kept in sync with the attribute_preprocessor_text_* generic_att_value = s:$(space_or_newline* "'") t:attribute_preprocessor_text_single? q:$("'" / &('/'? '>')) { return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); } / s:$(space_or_newline* '"') t:attribute_preprocessor_text_double? q:$('"' / &('/'? '>')) { return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); } / s:$space_or_newline* t:attribute_preprocessor_text &(space_or_newline / eof / '/'? '>') { return tu.getAttrVal(t, startOffset() + s.length, endOffset()); } // Attribute value, restricted to a single line. // Missing end quote: accept |, !!, \r, and \n look-ahead as heuristic. // These need to be kept in sync with the table_attribute_preprocessor_text_* table_att_value = s:$(space* "'") t:table_attribute_preprocessor_text_single? q:$("'" / &('!!' / [|\r\n])) { return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); } / s:$(space* '"') t:table_attribute_preprocessor_text_double? q:$('"' / &('!!' / [|\r\n])) { return tu.getAttrVal(t, startOffset() + s.length, endOffset() - q.length); } / s:$space* t:table_attribute_preprocessor_text &(space_or_newline/ eof / '!!' / '|') { return tu.getAttrVal(t, startOffset() + s.length, endOffset()); } /********************************************************* * Lists *********************************************************/ list_item = dtdd / hacky_dl_uses / li li = bullets:list_char+ c:nested_block_line // The inline_break is to check if we've hit a template end delimiter. &(eolf / inline_breaks) { if (c === null) { c = []; } // Leave bullets as an array -- list handler expects this var tsr = tsrOffsets('start'); tsr[1] += bullets.length; var li = new TagTk('listItem', [], { tsr: tsr }); li.bullets = bullets; return [ li, c ]; } /* * This rule is required to support wikitext of this form * ::{|border="1"|foo|bar|baz|} * where the leading colons are used to indent the entire table. * This hack was added back in 2006 in commit * a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl * Fürstenberg.
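 * The production below collects the leading colons into a single listItem token (bullets [":", ":", ...]) wrapping the table tokens, mirroring what li does for ordinary list lines.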
*/ hacky_dl_uses = bullets:":"+ tbl:(table_lines (sol table_lines)*) s:space* // Do we really need to RT this? &comment_space_eolf { // Leave bullets as an array -- list handler expects this var tsr = tsrOffsets('start'); tsr[1] += bullets.length; var li = new TagTk('listItem', [], { tsr: tsr }); li.bullets = bullets; return tu.flattenIfArray([li, tbl || [], s || []]); } dtdd = bullets:(!(";" !list_char) lc:list_char { return lc; })* ";" & {return stops.inc('colon');} c:nested_block_line cpos:(":" { return endOffset(); }) // Fortunately dtdds cannot be nested, so we can simply set the flag // back to 0 to disable it. & { stops.counters.colon = 0; return true;} d:nested_block_line? &eolf { // Leave bullets as an array -- list handler expects this // TSR: +1 for the leading ";" var numBullets = bullets.length + 1; var tsr = tsrOffsets('start'); tsr[1] += numBullets; var li1 = new TagTk('listItem', [], { tsr: tsr }); li1.bullets = bullets.slice(); li1.bullets.push(";"); // TSR: -1 for the intermediate ":" var li2 = new TagTk('listItem', [], { tsr: [cpos - 1, cpos], stx: 'row' }); li2.bullets = bullets.slice(); li2.bullets.push(":"); return [ li1 ].concat(c, [ li2 ], d || []); } // Fall-back case to clear the colon flag / & { stops.counters.colon = 0; return false; } list_char = [*#:;] /****************************************************************************** * Tables * ------ * Table rules are geared to support independent parsing of fragments in * templates (the common table start / row / table end use case). The tokens * produced by these fragments then match up to a table while building the * DOM tree. For similar reasons, table rows do not emit explicit end tag * tokens. * * The separate table_lines rule is faster than moving those rules * directly to block_lines. * * Notes about the full_table_in_link_caption rule * ----------------------------------------------------- * However, for link-tables, we have introduced a stricter parse wherein * we require table-start and table-end tags to not come from a template. * In addition, this new rule doesn't accept fosterable-content in * the table unlike the more lax (sol table_lines)+ rule. * * This is the best we can do at this time since we cannot distinguish * between table rows and image options entirely in the tokenizer. * * Consider the following examples: * * Example 1: * * [[Image:Foo.jpg|left|30px|Example 1 * {{This-template-returns-a-table-start-tag}} * |foo * {{This-template-returns-a-table-end-tag}} * ]] * * Example 2: * * [[Image:Foo.jpg|left|30px|Example 1 * {{echo|a}} * |foo * {{echo|b}} * ]] * * So, we cannot know a priori (without preprocessing or fully expanding * all templates) if "|foo" in the two examples is a table cell or an image * option. This is a limitation of our tokenizer-based approach compared to * the preprocessing-based approach of the PHP parser. * * Given this limitation, we are okay forcing a full-table context in * link captions (if necessary, we can relax the fosterable-content requirement * but that is broken wikitext anyway, so we can force that edge-case wikitext * to get fixed by rejecting it). ******************************************************************************/ full_table_in_link_caption = (! inline_breaks / & '{{!}}' ) r:( & { return stops.push('table', true); } tbl:( table_start_tag optionalNewlines (sol table_content_line optionalNewlines)* sol table_end_tag) { stops.pop('table'); return tbl; } / & { return stops.pop('table'); } ) { return r; } table_lines = (! 
inline_breaks / & '{{!}}' ) r:( & { return stops.push('table', true); } tl:table_line nls:optionalNewlines { stops.pop('table'); return tl.concat(nls); } / & { return stops.pop('table'); } ) { return r; } // This rule assumes start-of-line position! table_line = table_start_tag / table_content_line / table_end_tag table_content_line = (space / comment)* ( table_heading_tags / table_row_tag / table_data_tags / table_caption_tag ) table_start_tag "table_start_tag" = sc:(space / comment)* startPos:("" { return endOffset(); }) b:"{" p:pipe // ok to normalize away stray |} on rt (see T59360) & { return stops.push('table', false); } ta:table_attributes tsEndPos:("" { stops.pop('table'); return endOffset(); }) { var coms = tu.popComments(ta); if (coms) { tsEndPos = coms.commentStartPos; } var da = { tsr: [startPos, tsEndPos] }; if (p !== "|") { // Variation from default da.startTagSrc = b + p; } sc.push(new TagTk('table', ta, da)); if (coms) { sc = sc.concat(coms.buf); } return sc; } // FIXME: Not sure if we want to support it, but this should allow columns. table_caption_tag // avoid recursion via nested_block_in_table = ! { return stops.onStack('tableDataBlock'); } p:pipe "+" args:row_syntax_table_args? tagEndPos:("" { return endOffset(); }) c:nested_block_in_table* { return tu.buildTableTokens("caption", "|+", args, [startOffset(), tagEndPos], endOffset(), c, true); } table_row_tag = // avoid recursion via nested_block_in_table ! { return stops.onStack('tableDataBlock'); } p:pipe dashes:$"-"+ & { return stops.push('table', false); } a:table_attributes tagEndPos:("" { stops.pop('table'); return endOffset(); }) { var coms = tu.popComments(a); if (coms) { tagEndPos = coms.commentStartPos; } var da = { tsr: [ startOffset(), tagEndPos ], startTagSrc: p + dashes, }; // We rely on our tree builder to close the row as needed. This is // needed to support building tables from fragment templates with // individual cells or rows. var trToken = new TagTk('tr', a, da); var res = [ trToken ]; if (coms) { res = res.concat(coms.buf); } return res; } tds = ( pp:( pipe_pipe / p:pipe & row_syntax_table_args { return p; } ) tdt:table_data_tag { var da = tdt[0].dataAttribs; da.stx_v = "row"; da.tsr[0] -= pp.length; // include "||" if (pp !== "||" || (da.startTagSrc && da.startTagSrc !== pp)) { // Variation from default da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); } return tdt; } )* table_data_tags // avoid recursion via nested_block_in_table = ! { return stops.onStack('tableDataBlock'); } p:pipe ![+-] td:table_data_tag tagEndPos:("" { return endOffset(); }) tds:tds { var da = td[0].dataAttribs; da.tsr[0] -= p.length; // include "|" if (p !== "|") { // Variation from default da.startTagSrc = p; } return td.concat(tds); } table_data_tag = ! "}" arg:row_syntax_table_args? // use inline_breaks to break on tr etc tagEndPos:("" { return endOffset(); }) td:nested_block_in_table* { return tu.buildTableTokens("td", "|", arg, [startOffset(), tagEndPos], endOffset(), td); } table_heading_tags = "!" & { return stops.push('th', endOffset()); } th:table_heading_tag ths:( pp:("!!" / pipe_pipe) tht:table_heading_tag { var da = tht[0].dataAttribs; da.stx_v = 'row'; da.tsr[0] -= pp.length; // include "!!" or "||" if (pp !== "!!" || (da.startTagSrc && da.startTagSrc !== pp)) { // Variation from default da.startTagSrc = pp + (da.startTagSrc ? da.startTagSrc.substring(1) : ''); } return tht; } )* { stops.pop('th'); th[0].dataAttribs.tsr[0]--; // include "!" 
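// th is the first heading cell; ths holds any "!!"- or "||"-separated siblings from the same source line, already marked stx_v: 'row' above.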
return th.concat(ths); } / & { return stops.onStack('th') !== false ? stops.pop('th') : false; } table_heading_tag = arg:row_syntax_table_args? tagEndPos:("" { return endOffset(); }) c:( & { // This SyntaxStop is only true until we hit the end of the line. if (stops.onStack('th') !== false && /\n/.test(input.substring(stops.onStack('th'), endOffset()))) { // There's been a newline. Remove the break and continue // tokenizing nested_block_in_tables. stops.pop('th'); } return true; } d:nested_block_in_table { return d; } )* { return tu.buildTableTokens("th", "!", arg, [startOffset(), tagEndPos], endOffset(), c); } table_end_tag = sc:(space / comment)* startPos:("" { return endOffset(); }) p:pipe b:"}" { var tblEnd = new EndTagTk('table', [], { tsr: [startPos, endOffset()] }); if (p !== "|") { // p+"" is triggering some bug in pegJS // I cannot even use that expression in the comment! tblEnd.dataAttribs.endTagSrc = p + b; } return sc.concat([tblEnd]); } /** * Table parameters separated from the content by a single pipe. Does *not* * match if followed by double pipe (row-based syntax). */ row_syntax_table_args = & { return stops.push('tableCellArg', true); } as:table_attributes s:space* p:pipe !pipe { stops.pop('tableCellArg'); return [as, s, p]; } / & { return stops.pop('tableCellArg'); } /******************************************************************* * Text variants and other general rules *******************************************************************/ /* All chars that cannot start syntactic structures in the middle of a line * XXX: ] and other end delimiters should probably only be activated inside * structures to avoid unnecessarily leaving the text rule on plain * content. * * TODO: Much of this should really be context-dependent (syntactic * flags). The wikilink_preprocessor_text rule is an example where * text_char is not quite right and had to be augmented. Try to minimize / * clarify this carefully! */ text_char = [^-'<~[{\n\r:;\]}|!=] /* Legend * ' quotes (italic/bold) * < start of xmlish_tag * ~ signatures/dates * [ start of links * { start of parser functions, transclusion and template args * \n all sorts of block-level markup at start of line * \r ditto * A-Za-z autolinks (http(s), nntp(s), mailto, ISBN, PMID, RFC) * * _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related) * ! and | table cell delimiters, might be better to specialize those * = headings - also specialize those! * * The following chars are also included for now, but only apply in some * contexts and should probably be enabled only in those: * : separate definition in ; term : definition * ] end of link * } end of parser func/transclusion/template arg * - start of lang_variant -{ ... }- * ; separator in lang_variant */ urltext = ( $[^-'<~[{\n/A-Za-z_|!:;\]} &=]+ / & [/A-Za-z] al:autolink { return al; } / & "&" he:htmlentity { return he; } // Convert trailing space into &nbsp; // XXX: This should be moved to a serializer // This is a hack to force a whitespace display before the colon / ' ' & ':' { var toks = Util.placeholder('\u00a0', { src: ' ', tsr: tsrOffsets('start'), isDisplayHack: true, }, { tsr: tsrOffsets('end'), isDisplayHack: true }); var typeOf = toks[0].getAttribute('typeof'); toks[0].setAttribute('typeof', 'mw:DisplaySpace ' + typeOf); return toks; } / & ('__') bs:behavior_switch { return bs; } // About 96% of text_char calls originate here. // pegjs 0.8 inlines this simple rule automatically.
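// (Order matters in this choice: each specialized alternative above must fail before we fall back to a single text_char.)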
/ text_char )+ raw_htmlentity = m:$("&" [#0-9a-zA-Z]+ ";") { return Util.decodeEntities(m); } htmlentity = cc:raw_htmlentity { // if this is an invalid entity, don't tag it with 'mw:Entity' if (cc.length > 2 /* decoded entity would be 1 or 2 UTF-16 characters */) { return cc; } return [ new TagTk('span', [new KV('typeof', 'mw:Entity')], { src: text(), srcContent: cc, tsr: tsrOffsets('start') }), cc, new EndTagTk('span', [], { tsr: tsrOffsets('end') }), ]; } spaces = $[ \t]+ space = [ \t] optionalSpaceToken = s:$space* { if (s.length) { return [s]; } else { return []; } } /* This rule corresponds to \s in the PHP preg_* functions, * which is used frequently in the PHP parser. The inclusion of * form feed (but not other whitespace, like vertical tab) is a quirk * of Perl, which PHP inherited via the PCRE (Perl-Compatible Regular * Expressions) library. */ space_or_newline = [ \t\n\r\x0c] /* This rule corresponds to \b in the PHP preg_* functions, * after a word character. That is, it's a zero-width lookahead that * the next character is not a word character. */ end_of_word = eof / ![A-Za-z0-9_] // Unicode "separator, space" category. It covers the \u0020 space as well // as \u3000 IDEOGRAPHIC SPACE (see bug 19052). In PHP this is \p{Zs}. // Keep this up-to-date with the characters tagged ;Zs; in // http://www.unicode.org/Public/UNIDATA/UnicodeData.txt unispace = [ \u00A0\u1680\u2000-\u200A\u202F\u205F\u3000] // Non-newline whitespace, including non-breaking spaces. Used for magic links. space_or_nbsp = space // includes \t / unispace / he:htmlentity &{ return Array.isArray(he) && /^\u00A0$/.test(he[1]); } { return he; } // Used within ISBN magic links space_or_nbsp_or_dash = space_or_nbsp / "-" // Extra newlines followed by at least another newline. Usually used to // compress surplus newlines into a meta tag, so that they don't trigger // paragraphs. optionalNewlines = spc:$([\n\r\t ] &[\n\r])* { if (spc.length) { return [spc]; } else { return []; } } comment_or_includes = (comment / ( ( & { return stops.push("sol_il", true); } i:include_limits & { stops.pop("sol_il"); return true; } ) { return i; } / & { return stops.pop("sol_il"); } ))* sol = (empty_line_with_comments / sol_prefix) comment_or_includes sol_prefix = newlineToken / & { // Use the sol flag only at the start of the input // NOTE: Explicitly check for 'false' and not a falsy value return endOffset() === 0 && options.sol !== false; } { return []; } empty_line_with_comments = sp:sol_prefix p:("" { return endOffset(); }) c:(space* comment (space / comment)* newline)+ { return [ sp, new SelfclosingTagTk("meta", [new KV('typeof', 'mw:EmptyLine')], { tokens: tu.flattenIfArray(c), tsr: [p, endOffset()], }), ]; } comment_space = comment / space nl_comment_space = newline / comment_space /** * noinclude / includeonly / onlyinclude rules. These are normally * handled by the xmlish_tag rule, except where generic tags are not * allowed, for example in directives, which are allowed in various * attribute names and values. * * Example test case: * {| * |-<includeonly> * foo * </includeonly> * |Hello * |} */ include_limits = il:("<" c:"/"?
name:$(n:$[oyinclude]i+ & { var incl = n.toLowerCase(); return incl === "noinclude" || incl === "onlyinclude" || incl === "includeonly"; }) space_or_newline* ">" { var incl = name.toLowerCase(); var dp = { tsr: tsrOffsets() }; // Record variant since tag is not in normalized lower case if (name !== incl) { dp.srcTagName = name; } // End tag only if (c) { return new EndTagTk(name, [], dp); } var restOfInput = input.substring(endOffset()); var tagContent = restOfInput.match(new RegExp("^([\\s\\S]*?)(?:</" + name + "\\s*>)", "m")); // Start tag only if (!tagContent || !tagContent[1]) { return new TagTk(name, [], dp); } // Get the content var inclContent = tagContent[1]; // Preserve SOL where necessary (for onlyinclude and noinclude) // Note that this only works because we encounter <*include*> tags in // the toplevel content and we rely on the php preprocessor to expand // templates, so we shouldn't ever be tokenizing inInclude. // Last line should be empty (except for comments) if (incl !== "includeonly" && stops.onStack("sol_il")) { var last = lastItem(inclContent.split('\n')); if (!/^(<!--([^-]|-(?!->))*-->)*$/.test(last)) { return false; } } // Tokenize include content in a new tokenizer var inclContentToks = (new PegTokenizer(env)).tokenizeSync(inclContent); inclContentToks = Util.stripEOFTkfromTokens(inclContentToks); // Shift tsr Util.shiftTokenTSR(inclContentToks, endOffset()); // Skip past content peg$currPos += inclContent.length; return [new TagTk(name, [], dp)].concat(inclContentToks); }) & { return !!il; } { return il; } // Start of file sof = & { return endOffset() === 0 && !options.pipelineOffset; } // End of file eof = & { return endOffset() === input.length; } newline = '\n' / '\r\n' newlineToken = newline { return [new NlTk(tsrOffsets())]; } eolf = newline / eof comment_space_eolf = (space+ / comment)* (newline / eof) // 'Preprocessor' directive: higher-level things that can occur in otherwise // plain-text content. directive = comment / nowiki / tplarg_or_template / & "-{" v:lang_variant_or_tpl { return v; } / & "&" e:htmlentity { return e; } / include_limits wikilink_preprocessor_text = r:( t:$[^<[{\n\r\t|!\]}{ &\-]+ // XXX gwicke: any more chars we need to allow here? / !inline_breaks wr:( directive / $( !"]]" ( text_char / [!<\-\}\]\n\r] ) ) ) { return wr; } )+ { return tu.flattenStringlist(r); } extlink_preprocessor_text // added special separator character class inline: separates url from // description / text = & { // Prevent breaking on pipes when we're in a link description. // See the test, 'Images with the "|" character in the comment'. return stops.push('linkdesc', false); } r:( $[^'<~[{\n\r|!\]}\-\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ / !inline_breaks s:( directive / no_punctuation_char / [&|{\-] ) { return s; } /// urlencoded_char // !inline_breaks no_punctuation_char / $([.:,] !(space / eolf)) / $(['] ![']) // single quotes are ok, double quotes are bad )+ { stops.pop('linkdesc'); return tu.flattenString(r); } / & { return stops.pop('linkdesc'); } // Attribute values with preprocessor support // n.b. / is a permissible char in the three rules below. // We only break on />, enforced by the negated expression. // Hence, it isn't included in the stop set. // The stop set is space_or_newline and > which matches generic_att_value. attribute_preprocessor_text = r:( $[^{}&<\-|/ \t\n\r\x0c>]+ / !inline_breaks !'/>' s:( directive / [{}&<\-|/] ) { return s; } )+ { return tu.flattenString(r); } // The stop set is '> which matches generic_att_value.
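// E.g. in title='foo{{echo|bar}}baz' the plain-char alternative below eats "foo" and "baz", while the directive alternative passes the (hypothetical) {{echo|bar}} template through as a token.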
attribute_preprocessor_text_single = r:( $[^{}&<\-|/'>]+ / !inline_breaks !'/>' s:( directive / [{}&<\-|/] ) { return s; } )* { return tu.flattenString(r); } // The stop set is "> which matches generic_att_value. attribute_preprocessor_text_double = r:( $[^{}&<\-|/">]+ / !inline_breaks !'/>' s:( directive / [{}&<\-|/] ) { return s; } )* { return tu.flattenString(r); } // Variants with the entire attribute on a single line // n.b. ! is a permissible char in the three rules below. // We only break on !! in th, enforced by the inline break. // Hence, it isn't included in the stop set. // [ is also permissible but we give a chance to break // for the [[ special case in php's doTableStuff (See T2553). // The stop set is space_or_newline and | which matches table_att_value. table_attribute_preprocessor_text = r:( $[^{}&<\-!\[ \t\n\r\x0c|]+ / !inline_breaks s:( directive / [{}&<\-!\[] ) { return s; } )+ { return tu.flattenString(r); } // The stop set is '\r\n| which matches table_att_value. table_attribute_preprocessor_text_single = r:( $[^{}&<\-!\['\r\n|]+ / !inline_breaks s:( directive / [{}&<\-!\[] ) { return s; } )* { return tu.flattenString(r); } // The stop set is "\r\n| which matches table_att_value. table_attribute_preprocessor_text_double = r:( $[^{}&<\-!\["\r\n|]+ / !inline_breaks s:( directive / [{}&<\-!\[] ) { return s; } )* { return tu.flattenString(r); } // Special-case support for those pipe templates pipe = "|" / "{{!}}" // SSS FIXME: what about |{{!}} and {{!}}| pipe_pipe = "||" / "{{!}}{{!}}"
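// Inside a template, a literal "|" in table markup is conventionally written "{{!}}" (a template/magic word that expands to a pipe), so these rules accept it wherever the table grammar expects "|" or "||".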