231 lines
7.8 KiB
JavaScript
231 lines
7.8 KiB
JavaScript
|
'use strict';
|
||
|
|
||
|
var TransformStream = require('stream').Transform,
    StringDecoder = require('string_decoder').StringDecoder,
    inherits = require('util').inherits,
    DevNullStream = require('./dev_null_stream'),
    Tokenizer = require('../tokenizer'),
    ParserFeedbackSimulator = require('./parser_feedback_simulator'),
    mergeOptions = require('../common/merge_options');
|
||
|
|
||
|
/**
 * Options accepted by the {@link SAXParser} constructor.
 *
 * @typedef {Object} SAXParserOptions
 *
 * @property {Boolean} [locationInfo=false] - Enables source code location information for the tokens.
 * When enabled, each token event handler receives a {@link LocationInfo} (or {@link StartTagLocationInfo})
 * object as its last argument.
 */

// Defaults that user-supplied options are merged over in the SAXParser constructor.
var DEFAULT_OPTIONS = { locationInfo: false };
|
||
|
|
||
|
/**
 * Streaming [SAX]{@link https://en.wikipedia.org/wiki/Simple_API_for_XML}-style HTML parser.
 * It is a [transform stream](https://nodejs.org/api/stream.html#stream_class_stream_transform),
 * so data can be piped *through* it (see the example below): every chunk written to the parser
 * is pushed to its readable side unchanged, while token events are emitted along the way.
 *
 * @class SAXParser
 * @memberof parse5
 * @instance
 * @extends stream.Transform
 * @param {SAXParserOptions} options - Parsing options.
 * @example
 * var parse5 = require('parse5');
 * var http = require('http');
 * var fs = require('fs');
 *
 * var file = fs.createWriteStream('/home/google.com.html');
 * var parser = new parse5.SAXParser();
 *
 * parser.on('text', function(text) {
 *  // Handle page text content
 *  ...
 * });
 *
 * http.get('http://google.com', function(res) {
 *  // SAXParser is a Transform stream, so the response can be piped
 *  // through it: the page content is analyzed and, e.g., saved
 *  // to a file at the same time:
 *  res.pipe(parser).pipe(file);
 * });
 */
var SAXParser = module.exports = function (options) {
    TransformStream.call(this);

    // Effective options: user-supplied values merged over the defaults.
    this.options = mergeOptions(DEFAULT_OPTIONS, options);

    // Token source: the tokenizer wrapped by the parser feedback simulator.
    this.tokenizer = new Tokenizer(options);
    this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.tokenizer);

    // Text accumulated from consecutive character tokens and not emitted yet
    // (`null` when there is none), plus the location of the token being reported.
    this.pendingText = null;
    this.currentTokenLocation = undefined;

    // Stream state flags: set by end() and stop() respectively.
    this.lastChunkWritten = false;
    this.stopped = false;

    // NOTE: the parser is always piped into a /dev/null stream so that
    // `highWaterMark` is not hit even when there are no other consumers.
    // (see: https://github.com/inikulin/parse5/issues/97#issuecomment-171940774)
    this.pipe(new DevNullStream());
};

inherits(SAXParser, TransformStream);
|
||
|
|
||
|
//TransformStream implementation

/**
 * Decodes an incoming chunk to text for the tokenizer.
 *
 * Buffers go through a single, lazily created `StringDecoder`, so a multi-byte UTF-8 sequence
 * that is split across two chunks is reassembled instead of being decoded into replacement
 * characters. Strings are returned as is.
 *
 * @private
 * @param {Buffer|String} chunk - Chunk received by `_transform`.
 * @returns {String} Decoded text (may be empty if the chunk only holds a partial character).
 */
SAXParser.prototype._decodeChunk = function (chunk) {
    if (typeof chunk === 'string')
        return chunk;

    if (!this.decoder)
        this.decoder = new StringDecoder('utf8');

    var html = this.decoder.write(chunk);

    // No more input is expected: flush whatever incomplete sequence is still buffered.
    if (this.lastChunkWritten)
        html += this.decoder.end();

    return html;
};

/**
 * Feeds the chunk to the tokenizer and runs the parsing loop (unless parsing has been stopped),
 * then pushes the original chunk downstream unchanged.
 *
 * @param {Buffer|String} chunk - Incoming data.
 * @param {String} encoding - Chunk encoding as reported by the stream machinery (unused).
 * @param {Function} callback - Invoked once the chunk has been processed.
 */
SAXParser.prototype._transform = function (chunk, encoding, callback) {
    if (!this.stopped) {
        this.tokenizer.write(this._decodeChunk(chunk), this.lastChunkWritten);
        this._runParsingLoop();

        // Remember that the tokenizer has already been given the last-chunk flag,
        // so _flush doesn't signal the end of input a second time.
        if (this.lastChunkWritten)
            this.inputFinalized = true;
    }

    this.push(chunk);

    callback();
};

/**
 * Called by the stream machinery after all written data has been consumed.
 *
 * If the stream was ended without a final chunk (e.g. `end()` with no data, which is how
 * `pipe()` ends its destination), no `_transform` call carried the last-chunk flag. In that
 * case an empty last chunk is written here so that text still pending is emitted.
 *
 * @param {Function} callback - Invoked once flushing is done.
 */
SAXParser.prototype._flush = function (callback) {
    if (!this.stopped && !this.inputFinalized) {
        var tail = this.decoder ? this.decoder.end() : '';

        this.tokenizer.write(tail, true);
        this._runParsingLoop();
        this.inputFinalized = true;
    }

    callback();
};

/**
 * Ends the stream. Records that the last chunk has been written before delegating to the
 * base implementation, so the tokenizer write for the final chunk carries the last-chunk flag.
 *
 * @param {Buffer|String} [chunk] - Optional final chunk.
 * @param {String} [encoding] - Encoding of `chunk` if it is a string.
 * @param {Function} [callback] - Optional callback passed to the base `end()`.
 * @returns {*} Whatever the base `end()` implementation returns.
 */
SAXParser.prototype.end = function (chunk, encoding, callback) {
    this.lastChunkWritten = true;

    return TransformStream.prototype.end.call(this, chunk, encoding, callback);
};
|
||
|
|
||
|
/**
 * Stops parsing. Useful if you want the parser to stop consuming CPU time once you've obtained
 * the desired info from the input stream. Piping is not affected: data keeps flowing through
 * the parser as usual.
 *
 * @function stop
 * @memberof parse5#SAXParser
 * @instance
 * @example
 * var parse5 = require('parse5');
 * var http = require('http');
 * var fs = require('fs');
 *
 * var file = fs.createWriteStream('/home/google.com.html');
 * var parser = new parse5.SAXParser();
 *
 * parser.on('doctype', function(name, publicId, systemId) {
 *  // Process doctype info and stop parsing
 *  ...
 *  parser.stop();
 * });
 *
 * http.get('http://google.com', function(res) {
 *  // Even though parser.stop() was called, the whole
 *  // content of the page will be written to the file
 *  res.pipe(parser).pipe(file);
 * });
 */
SAXParser.prototype.stop = function () {
    // Checked by _transform (to skip tokenizing) and by the parsing loop (to exit).
    this.stopped = true;
};
|
||
|
|
||
|
//Internals

/**
 * Pulls tokens from the parser feedback simulator until the tokenizer hibernates (it needs
 * more input), the EOF token is reached, or the parser is stopped.
 *
 * Consecutive character tokens (regular, whitespace and null characters) are accumulated into
 * `pendingText` and reported as a single `text` event once any other token arrives. When
 * location info is enabled, the location of the first character token is kept and its
 * `endOffset` is extended as further character tokens are appended.
 *
 * @private
 */
SAXParser.prototype._runParsingLoop = function () {
    var token = null;

    do {
        token = this.parserFeedbackSimulator.getNextToken();

        if (token.type === Tokenizer.HIBERNATION_TOKEN)
            break;

        var isTextToken = token.type === Tokenizer.CHARACTER_TOKEN ||
                          token.type === Tokenizer.WHITESPACE_CHARACTER_TOKEN ||
                          token.type === Tokenizer.NULL_CHARACTER_TOKEN;

        if (isTextToken) {
            if (this.options.locationInfo) {
                if (this.pendingText === null)
                    this.currentTokenLocation = token.location;

                else
                    this.currentTokenLocation.endOffset = token.location.endOffset;
            }

            this.pendingText = (this.pendingText || '') + token.chars;
        }

        else {
            this._emitPendingText();

            // A `text` listener may have called stop() while the pending text was being
            // emitted; in that case no further events must be raised for this token.
            if (!this.stopped)
                this._handleToken(token);
        }
    } while (!this.stopped && token.type !== Tokenizer.EOF_TOKEN);
};
|
||
|
|
||
|
/**
 * Translates a non-text token into the corresponding parser event. Start tag, end tag,
 * comment and doctype tokens raise events; any other token type raises none.
 * When location info is enabled, the token location is stored in `currentTokenLocation`
 * first; `currentTokenLocation` is always passed as the last event argument.
 *
 * @private
 * @param {Object} token - Token produced by the tokenizer.
 */
SAXParser.prototype._handleToken = function (token) {
    if (this.options.locationInfo)
        this.currentTokenLocation = token.location;

    switch (token.type) {
        case Tokenizer.START_TAG_TOKEN:
            /**
             * Raised when the parser encounters a start tag.
             * @event startTag
             * @memberof parse5#SAXParser
             * @instance
             * @type {Function}
             * @param {String} name - Tag name.
             * @param {Array} attrs - List of attributes in the `{ name: String, value: String, prefix?: String }` form.
             * @param {Boolean} selfClosing - Indicates if the tag is self-closing.
             * @param {StartTagLocationInfo} [location] - Start tag source code location info.
             * Available if location info is enabled in {@link SAXParserOptions}.
             */
            this.emit('startTag', token.tagName, token.attrs, token.selfClosing, this.currentTokenLocation);
            break;

        case Tokenizer.END_TAG_TOKEN:
            /**
             * Raised when the parser encounters an end tag.
             * @event endTag
             * @memberof parse5#SAXParser
             * @instance
             * @type {Function}
             * @param {String} name - Tag name.
             * @param {LocationInfo} [location] - End tag source code location info.
             * Available if location info is enabled in {@link SAXParserOptions}.
             */
            this.emit('endTag', token.tagName, this.currentTokenLocation);
            break;

        case Tokenizer.COMMENT_TOKEN:
            /**
             * Raised when the parser encounters a comment.
             * @event comment
             * @memberof parse5#SAXParser
             * @instance
             * @type {Function}
             * @param {String} text - Comment text.
             * @param {LocationInfo} [location] - Comment source code location info.
             * Available if location info is enabled in {@link SAXParserOptions}.
             */
            this.emit('comment', token.data, this.currentTokenLocation);
            break;

        case Tokenizer.DOCTYPE_TOKEN:
            /**
             * Raised when the parser encounters a [document type declaration]{@link https://en.wikipedia.org/wiki/Document_type_declaration}.
             * @event doctype
             * @memberof parse5#SAXParser
             * @instance
             * @type {Function}
             * @param {String} name - Document type name.
             * @param {String} publicId - Document type public identifier.
             * @param {String} systemId - Document type system identifier.
             * @param {LocationInfo} [location] - Document type declaration source code location info.
             * Available if location info is enabled in {@link SAXParserOptions}.
             */
            this.emit('doctype', token.name, token.publicId, token.systemId, this.currentTokenLocation);
            break;
    }
};
|
||
|
|
||
|
/**
 * Emits the accumulated text (if any) as a single `text` event and clears the accumulator.
 *
 * @private
 */
SAXParser.prototype._emitPendingText = function () {
    // Nothing has been accumulated since the last emission.
    if (this.pendingText === null)
        return;

    /**
     * Raised when the parser encounters text content.
     * @event text
     * @memberof parse5#SAXParser
     * @instance
     * @type {Function}
     * @param {String} text - Text content.
     * @param {LocationInfo} [location] - Text content code location info.
     * Available if location info is enabled in {@link SAXParserOptions}.
     */
    this.emit('text', this.pendingText, this.currentTokenLocation);
    this.pendingText = null;
};
|