/**
 * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
 * chunks of tokens (one chunk per top-level block matched) and eventually an
 * end event. Tokens map to HTML tags as far as possible, with custom tokens
 * used where further processing on the token stream is needed.
 */
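/*
 * Illustrative sketch (hedged, not exhaustive): tokenizing the wikitext
 * "<!-- note -->\n" would yield roughly a CommentTk for the comment, a NlTk
 * for the newline, and a final EOFTk once the input is exhausted.
 */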
{
    var pegIncludes = options.pegIncludes;
    var pegTokenizer = options.pegTokenizer;
    var env = pegTokenizer.env;
    var pipelineOpts = pegTokenizer.options;
    var DU = pegIncludes.DOMUtils;
    var Util = pegIncludes.Util;
    var JSUtils = pegIncludes.JSUtils;
    var PegTokenizer = pegIncludes.PegTokenizer;
    var defines = pegIncludes.defines;
    var constants = pegIncludes.constants;
    var tu = pegIncludes.tu;

    // Define some constructor shortcuts.
    var KV = defines.KV;
    var TagTk = defines.TagTk;
    var SelfclosingTagTk = defines.SelfclosingTagTk;
    var EndTagTk = defines.EndTagTk;
    var NlTk = defines.NlTk;
    var CommentTk = defines.CommentTk;
    var EOFTk = defines.EOFTk;
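
    // Hedged usage sketch: token constructors take a tag name, an array of
    // KV attribute pairs, and a data-attribs object, e.g.
    //   new TagTk('td', [new KV('class', 'foo')], { tsr: tsrOffsets() })
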
    var lastItem = JSUtils.lastItem;
    var inlineBreaks = tu.inlineBreaks;
    var stops = new tu.SyntaxStops();
    var prevOffset = 0;

    // Some shorthands for legibility
    var startOffset = function() {
        return location().start.offset;
    };
    var endOffset = function() {
        return location().end.offset;
    };
    var tsrOffsets = function(flag) {
        return tu.tsrOffsets(location(), flag);
    };
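    // "tsr" = token source range: the [start, end] offsets of the current
    // match within the source wikitext. Hedged example: a rule matching the
    // first three characters of the input would see startOffset() === 0 and
    // endOffset() === 3.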
    /*
     * Emit a chunk of tokens to our consumers. Once this has been done, the
     * current expression can return an empty list (true).
     */
    var emitChunk = function(tokens) {
        // Shift the tsr of all tokens by the pipeline offset.
        Util.shiftTokenTSR(tokens, options.pipelineOffset);
        env.log("trace/peg", pegTokenizer.pipelineId, "----> ", tokens);
        var i;
        var n = tokens.length;

        // Enforce parsing resource limits.
        for (i = 0; i < n; i++) {
            tu.enforceParserResourceLimits(env, tokens[i]);
        }

        // Limit the size of individual chunks.
        var chunkLimit = 100000;
        if (n > chunkLimit) {
            i = 0;
            while (i < n) {
                options.cb(tokens.slice(i, i + chunkLimit));
                i += chunkLimit;
            }
        } else {
            options.cb(tokens);
        }
    };
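
    // Hedged usage sketch: a block-level rule's action might emit its tokens
    // and then yield an empty result, e.g.
    //   emitChunk([new TagTk('hr', [], { tsr: tsrOffsets() })]);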
    /* ------------------------------------------------------------------------
     * Extension tags should be parsed with higher priority than anything else.
     *
     * The trick we use is to strip out the content inside a matching tag-pair
     * and not tokenize it. The content, if it needs to be parsed (for example,
     * for <ref> or <*include*> tags), is parsed in a fresh tokenizer context,
     * which means any error correction that needs to happen is restricted to
     * the scope of the extension content and doesn't spill over to the higher
     * level. Ex: <math><!--foo</math>.
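     * (Hedged reading of that example: because the <math> body is stripped
     * rather than tokenized in place, the unclosed "<!--" stays confined to
     * it and cannot swallow the rest of the page.)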