Initial Commit

This commit is contained in:
valki
2020-10-17 18:42:50 +02:00
commit 664c6d8ca3
5892 changed files with 759183 additions and 0 deletions

View File

@@ -0,0 +1,268 @@
var max = require('lodash/max');
var compact = require('lodash/compact');
var times = require('lodash/times');
var trimStart = require('lodash/trimStart');
var padEnd = require('lodash/padEnd');
var he = require('he');
var helper = require('./helper');
/**
 * Format a text node.
 *
 * HTML entities in `elem.data` are decoded with `he.decode`. Inside a
 * preformatted block (`options.isInPre`) the decoded text is returned as is;
 * otherwise it is passed through `helper.wordwrap`, with leading whitespace
 * removed first when `elem.trimLeadingSpace` is set.
 *
 * @param {Object} elem - Text node; `data` holds the raw text.
 * @param {Object} options - Conversion options (`decodeOptions`, `isInPre`, plus
 *   whatever `helper.wordwrap` reads).
 * @returns {string} The formatted text.
 */
function formatText(elem, options) {
  var decoded = he.decode(elem.data || "", options.decodeOptions);
  if (options.isInPre) {
    return decoded;
  }
  var wrappable = elem.trimLeadingSpace ? trimStart(decoded) : decoded;
  return helper.wordwrap(wrappable, options);
}
/**
 * Format an image element as `alt [src]`, using whichever parts are present.
 *
 * @param {Object} elem - Image node; `attribs.alt` and `attribs.src` are read.
 * @param {Object} options - `ignoreImage` suppresses all output;
 *   `decodeOptions` is forwarded to `he.decode` for the alt text.
 * @returns {string} The textual representation, or '' when nothing to show.
 */
function formatImage(elem, options) {
  if (options.ignoreImage) {
    return '';
  }
  var attribs = elem.attribs || {};
  var parts = [];
  if (attribs.alt) {
    parts.push(he.decode(attribs.alt, options.decodeOptions));
  }
  if (attribs.src) {
    parts.push('[' + attribs.src + ']');
  }
  // A single space separates the alt text from the bracketed source.
  return parts.join(' ');
}
/**
 * Format a line break: a newline followed by the rendered children.
 *
 * @param {Object} elem - The break node.
 * @param {Function} fn - Renderer called as `fn(children, options)`.
 * @param {Object} options - Options forwarded to `fn`.
 * @returns {string}
 */
function formatLineBreak(elem, fn, options) {
  var inner = fn(elem.children, options);
  return '\n' + inner;
}
/**
 * Format a paragraph: the rendered children followed by one newline when
 * `options.singleNewLineParagraphs` is set, otherwise by two.
 *
 * @param {Object} elem - The paragraph node.
 * @param {Function} fn - Renderer called as `fn(children, options)`.
 * @param {Object} options - Conversion options.
 * @returns {string}
 */
function formatParagraph(elem, fn, options) {
  var body = fn(elem.children, options);
  var terminator = options.singleNewLineParagraphs ? '\n' : '\n\n';
  return body + terminator;
}
/**
 * Format a heading: the rendered children, upper-cased when
 * `options.uppercaseHeadings` is set, followed by a newline.
 *
 * @param {Object} elem - The heading node.
 * @param {Function} fn - Renderer called as `fn(children, options)`.
 * @param {Object} options - Conversion options.
 * @returns {string}
 */
function formatHeading(elem, fn, options) {
  var rendered = fn(elem.children, options);
  var title = options.uppercaseHeadings ? rendered.toUpperCase() : rendered;
  return title + '\n';
}
/**
 * Format an anchor element.
 *
 * With both anchor text and an href the result is "anchor text [href]"
 * (or "anchor text href" when `options.noLinkBrackets` is set). With only
 * one of the two, that part alone is returned.
 *
 * Options read here: `ignoreHref`, `noAnchorUrl` (drops hrefs starting with
 * '#'), `linkHrefBaseUrl` (prefixed to hrefs starting with '/'),
 * `hideLinkHrefIfSameAsText`, `noLinkBrackets`.
 *
 * @param {Object} elem - Anchor node; `attribs.href`, `children` and
 *   `trimLeadingSpace` are read.
 * @param {Function} fn - Renderer called as `fn(children, options)`.
 * @param {Object} options - Conversion options; `lineCharCount` is restored
 *   to its value on entry before the final formatting step.
 * @returns {string}
 */
function formatAnchor(elem, fn, options) {
  // Rendering the children may change lineCharCount; remember it so the
  // combined "text [href]" string is wrapped from the original column.
  var savedLineCharCount = options.lineCharCount;

  // The anchor text is always rendered, even when the href is ignored.
  var anchorText = fn(elem.children || [], options) || '';
  var output = elem.trimLeadingSpace ? trimStart(anchorText) : anchorText;

  var href = '';
  if (!options.ignoreHref) {
    if (elem.attribs && elem.attribs.href) {
      href = elem.attribs.href.replace(/^mailto:/, '');
    }
    // With noAnchorUrl set, in-page links ("#...") are not appended.
    if (href && (!options.noAnchorUrl || href[0] !== '#')) {
      if (options.linkHrefBaseUrl && href.indexOf('/') === 0) {
        href = options.linkHrefBaseUrl + href;
      }
      if (!options.hideLinkHrefIfSameAsText || href !== helper.replaceAll(output, '\n', '')) {
        output += options.noLinkBrackets ? ' ' + href : ' [' + href + ']';
      }
    }
  }

  options.lineCharCount = savedLineCharCount;
  return formatText({ data: output || href, trimLeadingSpace: elem.trimLeadingSpace }, options);
}
/**
 * Format a horizontal rule as a line of dashes surrounded by newlines.
 *
 * The rule is `options.wordwrap` characters wide. When word wrapping is
 * disabled or the value is not a usable width (false, null, 0, negative,
 * non-finite), a fixed width of 40 is used instead, so the rule never
 * collapses to nothing and `String.prototype.repeat` cannot throw.
 *
 * @param {Object} elem - The rule node (unused).
 * @param {Function} fn - Renderer (unused).
 * @param {Object} options - Conversion options; `wordwrap` is read.
 * @returns {string}
 */
function formatHorizontalLine(elem, fn, options) {
  var FALLBACK_WIDTH = 40;
  var width = Number(options.wordwrap);
  if (!isFinite(width) || !(width > 0)) {
    width = FALLBACK_WIDTH;
  }
  return '\n' + '-'.repeat(width) + '\n\n';
}
/**
 * Format one list item: the prefix, then the rendered children with every
 * continuation line indented by the prefix width, then a newline.
 *
 * The children are rendered with a copy of `options` whose `wordwrap` is
 * reduced by the prefix length (when wordwrap is truthy); the caller's
 * options object is not modified here.
 *
 * @param {string} prefix - Bullet or number shown before the item.
 * @param {Object} elem - The list item node.
 * @param {Function} fn - Renderer called as `fn(children, options)`.
 * @param {Object} options - Conversion options.
 * @returns {string}
 */
function formatListItem(prefix, elem, fn, options) {
  var itemOptions = Object.assign({}, options);
  if (itemOptions.wordwrap) {
    // Leave room for the prefix on every line of the item.
    itemOptions.wordwrap -= prefix.length;
  }
  var hangingIndent = ' '.repeat(prefix.length);
  var body = fn(elem.children, itemOptions)
    .split('\n')
    .join('\n' + hangingIndent);
  return prefix + body + '\n';
}
// Matches text that is empty or consists only of whitespace.
var whiteSpaceRegex = /^\s*$/;

/**
 * Format an unordered list. Every child except whitespace-only text nodes is
 * rendered as a list item prefixed with `options.unorderedListItemPrefix`;
 * the list is terminated by an extra newline.
 *
 * @param {Object} elem - The list node.
 * @param {Function} fn - Renderer called as `fn(children, options)`.
 * @param {Object} options - Conversion options.
 * @returns {string}
 */
function formatUnorderedList(elem, fn, options) {
  var bullet = options.unorderedListItemPrefix;
  var items = (elem.children || []).filter(function(child) {
    return !(child.type === 'text' && whiteSpaceRegex.test(child.data));
  });
  var rendered = items.reduce(function(output, item) {
    return output + formatListItem(bullet, item, fn, options);
  }, '');
  return rendered + '\n';
}
/**
 * Format an ordered list. Every child except whitespace-only text nodes is
 * rendered as a list item with a " <index>. " prefix; the list is terminated
 * by an extra newline.
 *
 * The `type` attribute selects the numbering: 'a' (lowercase letters),
 * 'A' (uppercase letters); anything else falls back to decimal numbers.
 * The `start` attribute sets the first index (default 1); a non-numeric
 * value is treated as 1.
 *
 * @param {Object} elem - The list node; `attribs` may be absent.
 * @param {Function} fn - Renderer called as `fn(children, options)`.
 * @param {Object} options - Conversion options.
 * @returns {string}
 */
function formatOrderedList(elem, fn, options) {
  // Not every node carries an attribs object; treat it as empty then.
  var attribs = elem.attribs || {};
  var items = (elem.children || []).filter(function(child) {
    return child.type !== 'text' || !whiteSpaceRegex.test(child.data);
  });
  if (!items.length) {
    return '\n';
  }

  // Zero-based offset derived from the start attribute.
  var start = Number(attribs.start || '1') - 1;
  if (!isFinite(start)) {
    start = 0;
  }

  // TODO Implement the other valid types ('i', 'I').
  // Until then they fall back to decimal numbering.
  var indexFor;
  switch (attribs.type || '1') {
    case 'a':
      indexFor = function(i) { return String.fromCharCode(i + start + 97); };
      break;
    case 'A':
      indexFor = function(i) { return String.fromCharCode(i + start + 65); };
      break;
    default:
      indexFor = function(i) { return i + 1 + start; };
  }

  // Width of the largest decimal index, used to align the item bodies.
  var maxLength = (items.length + start).toString().length;
  var result = '';
  items.forEach(function(item, i) {
    var index = indexFor(i);
    // Clamp at zero: with a negative start an index can be wider than
    // maxLength, and String.prototype.repeat throws on negative counts.
    var spacing = Math.max(0, maxLength - index.toString().length);
    var prefix = ' ' + index + '. ' + ' '.repeat(spacing);
    result += formatListItem(prefix, item, fn, options);
  });
  return result + '\n';
}
/**
 * Render a table (array of rows, each an array of cell strings) as text with
 * space-padded columns.
 *
 * Column widths are the maximum cell length per column, measured before
 * trimming. Each cell is trimmed, padded to its column width and followed by
 * one space; each row ends with a newline and the table with an extra one.
 *
 * @param {string[][]} table - Rows of cell strings.
 * @returns {string}
 */
function tableToString(table) {
  // Lengths per row, transposed so that each entry lists one column's lengths.
  var lengthsByRow = table.map(function(row) {
    return row.map(function(cell) {
      return cell.length;
    });
  });
  var columnWidths = helper.arrayZip(lengthsByRow).map(function(columnLengths) {
    return max(columnLengths);
  });

  var lines = table.map(function(row) {
    var cells = row.map(function(cell, columnIndex) {
      return padEnd(cell.trim(), columnWidths[columnIndex], ' ') + ' ';
    });
    return cells.join('') + '\n';
  });
  return lines.join('') + '\n';
}
/**
 * Format a table element as column-aligned text.
 *
 * Walks the table's children, collects the text of every `tr` into `table`
 * (an array of rows, each an array of cell strings) and renders it with
 * `tableToString`.
 *
 * @param {Object} elem - The table node; `children` must be present.
 * @param {Function} fn - Renderer called as `fn(children, options)`.
 * @param {Object} options - Conversion options forwarded to the renderers.
 * @returns {string}
 */
function formatTable(elem, fn, options) {
var table = [];
elem.children.forEach(tryParseRows);
return tableToString(table);
// Function declaration is hoisted, so it is usable above despite the return.
function tryParseRows(elem) {
// Only tags can hold rows; text and other node types are ignored.
if (elem.type !== 'tag') {
return;
}
switch (elem.name.toLowerCase()) {
// Container elements: recurse into them to find the rows.
case "thead":
case "tbody":
case "tfoot":
case "center":
elem.children.forEach(tryParseRows);
return;
case 'tr':
// One entry per cell; each entry is the list of text lines of that cell.
var rows = [];
elem.children.forEach(function(elem) {
var tokens, count;
if (elem.type === 'tag') {
switch (elem.name.toLowerCase()) {
case 'th':
// Header cells are rendered like headings; `compact` drops the empty lines.
// NOTE(review): colspan is only handled for `td` below, not for `th`.
tokens = formatHeading(elem, fn, options).split('\n');
rows.push(compact(tokens));
break;
case 'td':
tokens = fn(elem.children, options).split('\n');
rows.push(compact(tokens));
// Fill colspans with empty values
if (elem.attribs && elem.attribs.colspan) {
// The attribute is coerced to a number by the subtraction; NaN becomes 0.
count = elem.attribs.colspan - 1 || 0;
times(count, function() {
rows.push(['']);
});
}
break;
}
}
});
// Transpose: from "lines per cell" to "cells per text line", so a cell
// with several lines produces several output rows.
rows = helper.arrayZip(rows);
rows.forEach(function(row) {
// Cells with fewer lines than the tallest one are padded with ''.
row = row.map(function(col) {
return col || '';
});
table.push(row);
});
break;
}
}
}
/**
 * Format a blockquote: "> " followed by the rendered children and a newline.
 *
 * @param {Object} elem - The blockquote node.
 * @param {Function} fn - Renderer called as `fn(children, options)`.
 * @param {Object} options - Options forwarded to `fn`.
 * @returns {string}
 */
function formatBlockquote(elem, fn, options) {
  var quoted = fn(elem.children, options);
  return '> ' + quoted + '\n';
}
// Public formatter table: one entry per supported element kind.
Object.assign(exports, {
  text: formatText,
  image: formatImage,
  lineBreak: formatLineBreak,
  paragraph: formatParagraph,
  anchor: formatAnchor,
  heading: formatHeading,
  table: formatTable,
  orderedList: formatOrderedList,
  unorderedList: formatUnorderedList,
  listItem: formatListItem,
  horizontalLine: formatHorizontalLine,
  blockquote: formatBlockquote
});

View File

@@ -0,0 +1,139 @@
var zip = require('lodash/zip');
var trimEnd = require('lodash/trimEnd');
/**
 * Split a word that is longer than the word wrap limit into several lines.
 *
 * The candidate characters in `options.longWordSplit.wrapCharacters` are
 * tried in order: the word is broken after the last occurrence of the
 * current character within the first `wordwrap` characters. Once no
 * candidate matches, the word is either cut at the limit
 * (`forceWrapOnLimit`) or kept at full length, followed by a newline unless
 * `options.preserveNewlines` is set.
 *
 * A `wordwrap` of zero or less cannot be satisfied; the word is then kept
 * whole, exactly as in the "cannot split" case.
 *
 * @param {string} word - The word to split.
 * @param {Object} options - Reads `wordwrap`, `preserveNewlines` and
 *   `longWordSplit` (`wrapCharacters`, `forceWrapOnLimit`).
 * @returns {string} The pieces joined with '\n'.
 */
function splitLongWord(word, options) {
  var wrapCharacters = options.longWordSplit.wrapCharacters || [];
  var forceWrapOnLimit = options.longWordSplit.forceWrapOnLimit || false;
  var max = options.wordwrap;
  var fuseWord = [];
  var idx = 0;

  while (word.length > max) {
    if (max <= 0) {
      // No positive width to wrap to: slicing would make no progress (and
      // would drop characters for negative limits), so keep the word whole.
      if (!options.preserveNewlines) {
        word += '\n';
      }
      break;
    }

    var firstLine = word.substr(0, max);
    var remainingChars = word.substr(max);
    // Only search while a candidate is left; otherwise `undefined` would be
    // coerced to the text "undefined" and could match inside the word.
    var splitIndex = idx < wrapCharacters.length
      ? firstLine.lastIndexOf(wrapCharacters[idx])
      : -1;

    if (splitIndex > -1) {
      // Found a character to split on: store everything up to and including
      // it, then check whether the rest needs splitting again.
      fuseWord.push(firstLine.substr(0, splitIndex + 1));
      word = firstLine.substr(splitIndex + 1) + remainingChars;
      continue;
    }

    idx++;
    if (idx < wrapCharacters.length) {
      // Retry the same word with the next candidate character.
      continue;
    }

    // Cannot split on a character: either cut at the limit or preserve length.
    if (forceWrapOnLimit) {
      fuseWord.push(firstLine);
      word = remainingChars;
      if (word.length > max) {
        continue;
      }
    } else if (!options.preserveNewlines) {
      word += '\n';
    }
    break;
  }

  fuseWord.push(word);
  return fuseWord.join('\n');
}
/**
 * Wrap `text` so that lines do not exceed `options.wordwrap` characters.
 *
 * Reads `options.wordwrap` (the limit; wrapping is skipped when it is falsy
 * and not 0), `options.preserveNewlines`, `options.lineCharCount` (number of
 * characters already on the current output line) and `options.longWordSplit`.
 * A single leading and trailing space of the input is preserved.
 *
 * @param {string} text - The text to wrap.
 * @param {Object} options - Conversion options, see above.
 * @returns {string} The wrapped text.
 */
exports.wordwrap = function wordwrap(text, options) {
var max = options.wordwrap;
var preserveNewlines = options.preserveNewlines;
// Running length of the current output line, seeded by the caller.
var length = options.lineCharCount;
// Preserve leading space
var result = text.startsWith(' ') ? ' ' : '';
length += result.length;
// Words collected for the current line, not yet appended to `result`.
var buffer = [];
// Split the text into words, decide to preserve new lines or not.
// When preserving, each '\n' stays attached to the end of the word before it.
var words = preserveNewlines
? text.trim().replace(/\n/g, '\n ').split(/\ +/)
: text.trim().split(/\s+/);
// Determine where to end line word by word.
words.forEach(function(word) {
// Add buffer to result if we can't fit any more words in the buffer.
// (`indexOf('\n')` is -1 for words without a newline, which makes the second test weaker than the first.)
if ((max || max === 0) && length > 0 && ((length + word.length > max) || (length + word.indexOf('\n') > max))) {
// Concat buffer and add it to the result
result += buffer.join(' ') + '\n';
// Reset buffer and length
buffer.length = length = 0;
}
// Check if the current word is long enough to be wrapped
if ((max || max === 0) && (options.longWordSplit) && (word.length > max)) {
word = splitLongWord(word, options);
}
buffer.push(word);
// If the word contains a newline then restart the count and add the buffer to the result
if (word.indexOf('\n') !== -1) {
result += buffer.join(' ');
// Reset the buffer, let the length include any characters after the last newline
buffer.length = 0;
length = word.length - (word.lastIndexOf('\n') + 1);
// If there are characters after the newline, add a space and increase the length by 1
if (length) {
result += ' ';
length++;
}
} else {
// Add word length + one whitespace
length += word.length + 1;
}
});
// Add the rest to the result.
result += buffer.join(' ');
// Preserve trailing space
if (!text.endsWith(' ')) {
result = trimEnd(result);
} else if (!result.endsWith(' ')) {
result = result + ' ';
}
return result;
};
exports.arrayZip = function arrayZip(array) {
return zip.apply(null, array);
};
exports.splitCssSearchTag = function splitCssSearchTag(tagString) {
function getParams(re, string) {
var captures = [], found;
while ((found = re.exec(string)) !== null) {
captures.push(found[1]);
}
return captures;
}
var splitTag = {};
var elementRe = /(^\w*)/g;
splitTag.element = elementRe.exec(tagString)[1];
splitTag.classes = getParams( /\.([\d\w-]*)/g, tagString);
splitTag.ids = getParams( /#([\d\w-]*)/g, tagString);
return splitTag;
};
exports.replaceAll = function replaceAll(str, find, replace) {
var reg = new RegExp(find, 'g');
return str.replace(reg, replace);
};

View File

@@ -0,0 +1,188 @@
var includes = require('lodash/includes');
var trimEnd = require('lodash/trimEnd');
var htmlparser = require('htmlparser2');
var helper = require('./helper');
var defaultFormat = require('./formatter');
// Node types whose content is never converted to text.
var SKIP_TYPES = ['style', 'script'];
/**
 * Convert an HTML string to plain text.
 *
 * The given options are shallow-merged over the defaults below, the HTML is
 * parsed with htmlparser, and the subtree selected by each entry of
 * `options.baseElement` is rendered in turn. Trailing whitespace is removed
 * from the combined result.
 *
 * @param {string} html - The HTML source.
 * @param {Object} [options] - Overrides for the default options.
 * @returns {string} The plain-text rendering.
 */
function htmlToText(html, options) {
  var defaults = {
    wordwrap: 80,
    tables: [],
    preserveNewlines: false,
    uppercaseHeadings: true,
    singleNewLineParagraphs: false,
    hideLinkHrefIfSameAsText: false,
    linkHrefBaseUrl: null,
    noLinkBrackets: false,
    noAnchorUrl: true,
    baseElement: 'body',
    returnDomByDefault: true,
    format: {},
    decodeOptions: {
      isAttributeValue: false,
      strict: false
    },
    longWordSplit: {
      wrapCharacters: [],
      forceWrapOnLimit: false
    },
    unorderedListItemPrefix: ' * '
  };
  options = Object.assign(defaults, options || {});

  // The handler collects the parsed DOM; its callback is intentionally empty.
  var handler = new htmlparser.DefaultHandler(function (error, dom) {
  }, {
    verbose: true
  });
  new htmlparser.Parser(handler).parseComplete(html);

  options.lineCharCount = 0;
  var selectors = Array.isArray(options.baseElement)
    ? options.baseElement
    : [options.baseElement];
  var text = '';
  for (var i = 0; i < selectors.length; ++i) {
    var subtree = filterBody(handler.dom, options, selectors[i]);
    text += walk(subtree, options);
  }
  return trimEnd(text);
}
/**
 * Find the first element (depth-first, document order) matching the simple
 * selector `baseElement` (element name plus optional classes and ids).
 *
 * @param {Object[]} dom - The parsed nodes to search.
 * @param {Object} options - `returnDomByDefault` decides the fallback.
 * @param {string} baseElement - Selector such as "body" or "div.content".
 * @returns {?Object[]} A one-element array with the match. Without a match:
 *   `dom` when `options.returnDomByDefault` is set, otherwise null.
 */
function filterBody(dom, options, baseElement) {
  var selector = helper.splitCssSearchTag(baseElement);
  var match = null;

  // True when every required value occurs in the space-separated list.
  function hasAll(required, present) {
    return required.every(function (val) { return present.indexOf(val) >= 0; });
  }

  function search(nodes) {
    nodes.forEach(function(node) {
      if (match) {
        return;
      }
      if (node.name === selector.element) {
        var attribs = node.attribs;
        var nodeClasses = attribs && attribs.class ? attribs.class.split(" ") : [];
        var nodeIds = attribs && attribs.id ? attribs.id.split(" ") : [];
        if (hasAll(selector.classes, nodeClasses) && hasAll(selector.ids, nodeIds)) {
          match = [node];
          return;
        }
      }
      if (node.children) {
        search(node.children);
      }
    });
  }

  search(dom);
  return options.returnDomByDefault ? match || dom : match;
}
/**
 * Decide whether a table element should be rendered as a table.
 *
 * @param {Object} attr - The table's attributes (`class` and `id` are read);
 *   may be missing.
 * @param {boolean|string[]} tables - `true` to accept every table, otherwise
 *   a list of selectors of the form ".className" or "#id".
 * @returns {boolean} True when `tables` is `true`, when one of the element's
 *   classes (or its whole `class` attribute) matches a ".className" selector,
 *   or when its `id` matches a "#id" selector.
 */
function containsTable(attr, tables) {
  if (tables === true) return true;
  // Nothing can match without attributes or without a selector list.
  if (!attr || !Array.isArray(tables)) return false;

  // Selectors starting with `prefix`, with the prefix removed.
  function selectorsWithPrefix(prefix) {
    return tables
      .filter(function(key) {
        return typeof key === 'string' && key.startsWith(prefix);
      })
      .map(function(key) {
        return key.substr(1);
      });
  }

  var classes = selectorsWithPrefix('.');
  var ids = selectorsWithPrefix('#');

  var classAttr = attr['class'];
  // The class attribute is a whitespace-separated list: match each name on
  // its own, and also the attribute as a whole as before.
  var classNames = typeof classAttr === 'string'
    ? classAttr.split(/\s+/).filter(function(name) { return name !== ''; })
    : [];
  var hasClass = classes.indexOf(classAttr) !== -1 ||
    classNames.some(function(name) { return classes.indexOf(name) !== -1; });

  return hasClass || ids.indexOf(attr['id']) !== -1;
}
/**
 * Render a list of DOM nodes to text, dispatching each node to the matching
 * formatter.
 *
 * Formatters come from `defaultFormat`, overridden per key by
 * `options.format`. After every node `options.lineCharCount` is set to the
 * number of characters following the last newline in the text so far.
 *
 * @param {Object[]} dom - Nodes to render; a falsy value renders nothing.
 * @param {Object} options - Conversion options; `lineCharCount` is updated.
 * @param {string} [result] - Text produced so far; defaults to ''. It is
 *   passed when recursing into elements that have no formatter of their own.
 * @returns {string} `result` extended with the rendering of `dom`.
 */
function walk(dom, options, result) {
if (arguments.length < 3) {
result = '';
}
// True when the text ends with a whitespace character.
var whiteSpaceRegex = /\s$/;
var format = Object.assign({}, defaultFormat, options.format);
if (!dom) {
return result;
}
dom.forEach(function(elem) {
switch(elem.type) {
case 'tag':
switch(elem.name.toLowerCase()) {
case 'img':
result += format.image(elem, options);
break;
case 'a':
// Inline element needs its leading space to be trimmed if `result`
// currently ends with whitespace
elem.trimLeadingSpace = whiteSpaceRegex.test(result);
result += format.anchor(elem, walk, options);
break;
case 'p':
result += format.paragraph(elem, walk, options);
break;
case 'h1':
case 'h2':
case 'h3':
case 'h4':
case 'h5':
case 'h6':
result += format.heading(elem, walk, options);
break;
case 'br':
result += format.lineBreak(elem, walk, options);
break;
case 'hr':
result += format.horizontalLine(elem, walk, options);
break;
case 'ul':
result += format.unorderedList(elem, walk, options);
break;
case 'ol':
result += format.orderedList(elem, walk, options);
break;
case 'pre':
// Preformatted content is rendered like a paragraph, with a copy of the
// options that has `isInPre` set.
var newOptions = Object.assign({}, options);
newOptions.isInPre = true;
result += format.paragraph(elem, walk, newOptions);
break;
case 'table':
// Only tables selected by `options.tables` are laid out as tables; the
// content of all others is rendered in place.
result = containsTable(elem.attribs, options.tables)
? result + format.table(elem, walk, options)
: walk(elem.children || [], options, result);
break;
case 'blockquote':
result += format.blockquote(elem, walk, options);
break;
default:
// Any other tag: render its children in place, continuing the current text.
result = walk(elem.children || [], options, result);
}
break;
case 'text':
// A text node that is exactly "\r\n" is skipped.
if (elem.data !== '\r\n') {
// Text needs its leading space to be trimmed if `result`
// currently ends with whitespace
elem.trimLeadingSpace = whiteSpaceRegex.test(result);
result += format.text(elem, options);
}
break;
default:
// Other node types are descended into unless listed in SKIP_TYPES.
if (!includes(SKIP_TYPES, elem.type)) {
result = walk(elem.children || [], options, result);
}
}
// Characters on the current (last) line, read by the word wrapper.
options.lineCharCount = result.length - (result.lastIndexOf('\n') + 1);
});
return result;
}
exports.fromString = function(str, options) {
return htmlToText(str, options || {});
};