var includes = require('lodash/includes'); var trimEnd = require('lodash/trimEnd'); var htmlparser = require('htmlparser2'); var helper = require('./helper'); var defaultFormat = require('./formatter'); // Which type of tags should not be parsed var SKIP_TYPES = [ 'style', 'script' ]; function htmlToText(html, options) { options = Object.assign({ wordwrap: 80, tables: [], preserveNewlines: false, uppercaseHeadings: true, singleNewLineParagraphs: false, hideLinkHrefIfSameAsText: false, linkHrefBaseUrl: null, noLinkBrackets: false, noAnchorUrl: true, baseElement: 'body', returnDomByDefault: true, format: {}, decodeOptions: { isAttributeValue: false, strict: false }, longWordSplit: { wrapCharacters: [], forceWrapOnLimit: false }, unorderedListItemPrefix: ' * ' }, options || {}); var handler = new htmlparser.DefaultHandler(function (error, dom) { }, { verbose: true }); new htmlparser.Parser(handler).parseComplete(html); options.lineCharCount = 0; var result = ''; var baseElements = Array.isArray(options.baseElement) ? options.baseElement : [options.baseElement]; for (var idx = 0; idx < baseElements.length; ++idx) { result += walk(filterBody(handler.dom, options, baseElements[idx]), options); } return trimEnd(result); } function filterBody(dom, options, baseElement) { var result = null; var splitTag = helper.splitCssSearchTag(baseElement); function walk(dom) { if (result) return; dom.forEach(function(elem) { if (result) return; if (elem.name === splitTag.element) { var documentClasses = elem.attribs && elem.attribs.class ? elem.attribs.class.split(" ") : []; var documentIds = elem.attribs && elem.attribs.id ? elem.attribs.id.split(" ") : []; if ((splitTag.classes.every(function (val) { return documentClasses.indexOf(val) >= 0; })) && (splitTag.ids.every(function (val) { return documentIds.indexOf(val) >= 0; }))) { result = [elem]; return; } } if (elem.children) walk(elem.children); }); } walk(dom); return options.returnDomByDefault ? result || dom : result; } function containsTable(attr, tables) { if (tables === true) return true; function removePrefix(key) { return key.substr(1); } function checkPrefix(prefix) { return function(key) { return key.startsWith(prefix); }; } function filterByPrefix(tables, prefix) { return tables .filter(checkPrefix(prefix)) .map(removePrefix); } var classes = filterByPrefix(tables, '.'); var ids = filterByPrefix(tables, '#'); return attr && (includes(classes, attr['class']) || includes(ids, attr['id'])); } function walk(dom, options, result) { if (arguments.length < 3) { result = ''; } var whiteSpaceRegex = /\s$/; var format = Object.assign({}, defaultFormat, options.format); if (!dom) { return result; } dom.forEach(function(elem) { switch(elem.type) { case 'tag': switch(elem.name.toLowerCase()) { case 'img': result += format.image(elem, options); break; case 'a': // Inline element needs its leading space to be trimmed if `result` // currently ends with whitespace elem.trimLeadingSpace = whiteSpaceRegex.test(result); result += format.anchor(elem, walk, options); break; case 'p': result += format.paragraph(elem, walk, options); break; case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': result += format.heading(elem, walk, options); break; case 'br': result += format.lineBreak(elem, walk, options); break; case 'hr': result += format.horizontalLine(elem, walk, options); break; case 'ul': result += format.unorderedList(elem, walk, options); break; case 'ol': result += format.orderedList(elem, walk, options); break; case 'pre': var newOptions = Object.assign({}, options); newOptions.isInPre = true; result += format.paragraph(elem, walk, newOptions); break; case 'table': result = containsTable(elem.attribs, options.tables) ? result + format.table(elem, walk, options) : walk(elem.children || [], options, result); break; case 'blockquote': result += format.blockquote(elem, walk, options); break; default: result = walk(elem.children || [], options, result); } break; case 'text': if (elem.data !== '\r\n') { // Text needs its leading space to be trimmed if `result` // currently ends with whitespace elem.trimLeadingSpace = whiteSpaceRegex.test(result); result += format.text(elem, options); } break; default: if (!includes(SKIP_TYPES, elem.type)) { result = walk(elem.children || [], options, result); } } options.lineCharCount = result.length - (result.lastIndexOf('\n') + 1); }); return result; } exports.fromString = function(str, options) { return htmlToText(str, options || {}); };