124 lines
3.9 KiB
JavaScript
Raw Normal View History

2020-10-17 18:42:50 +02:00
'use strict';
const iconv = require('iconv-lite');
const encodingJapanese = require('encoding-japanese');
const charsets = require('./charsets');
/**
 * Matches normalized charset names whose bytes can be read directly as UTF-8.
 * All alternatives live in one group so that both anchors apply to each of
 * them; without the group, `utf-8` would match anywhere inside a longer name.
 */
const UTF8_COMPATIBLE_CHARSET = /^(?:(?:us-)?ascii|utf-8|7bit)$/i;

/**
 * Character set encoding and decoding functions
 */
const charset = (module.exports = {
    /**
     * Encodes a unicode string into a Buffer object as UTF-8
     *
     * We force UTF-8 here, no strange encodings allowed.
     *
     * @param {String} str String to be encoded
     * @return {Buffer} UTF-8 encoded typed array
     */
    encode(str) {
        return Buffer.from(str, 'utf-8');
    },

    /**
     * Decodes a string from Buffer to a unicode string using the specified encoding.
     *
     * Conversion order for charsets that are not UTF-8 compatible:
     *   1. the caller supplied Iconv constructor, when one is given
     *   2. encoding-japanese, for JIS / ISO-2022-JP / EUC-JP names
     *   3. iconv-lite
     * If conversion throws, the error is swallowed and the input is returned
     * via its default toString() (UTF-8 for a Buffer), so data loss might occur.
     *
     * @param {Buffer} buf Binary data to be decoded
     * @param {String} [fromCharset='UTF-8'] Binary data is decoded into string using this charset
     * @param {Function} [Iconv] node-iconv function
     * @return {String} Decoded string
     */
    decode(buf, fromCharset, Iconv) {
        fromCharset = charset.normalizeCharset(fromCharset || 'UTF-8');

        if (UTF8_COMPATIBLE_CHARSET.test(fromCharset)) {
            return buf.toString('utf-8');
        }

        try {
            if (typeof Iconv === 'function') {
                const decoder = new Iconv(fromCharset, 'UTF-8');
                return decoder.convert(buf).toString();
            }

            if (/^jis|^iso-?2022-?jp|^EUCJP/i.test(fromCharset)) {
                if (typeof buf === 'string') {
                    buf = Buffer.from(buf);
                }

                try {
                    const output = encodingJapanese.convert(buf, {
                        to: 'UNICODE',
                        from: fromCharset,
                        type: 'string'
                    });

                    // Return the decoded text itself so that every path of
                    // decode() yields a String. Anything that is not a string
                    // falls through to iconv-lite below.
                    if (typeof output === 'string') {
                        return output;
                    }
                } catch (err) {
                    // ignore, defaults to iconv-lite on error
                }
            }

            return iconv.decode(buf, fromCharset);
        } catch (err) {
            // enforce utf-8, data loss might occur
            return buf.toString();
        }
    },

    /**
     * Convert data from a specific encoding to a UTF-8 Buffer
     *
     * String input is always encoded as UTF-8. Binary input that is already in
     * a UTF-8 compatible charset is returned untouched (same object, no copy).
     *
     * @param {String|Buffer} data String or binary data to be converted
     * @param {String} [fromCharset='UTF-8'] Source encoding for binary data
     * @param {Function} [Iconv] node-iconv function, passed on to decode()
     * @return {Buffer} UTF-8 encoded typed array
     */
    convert(data, fromCharset, Iconv) {
        fromCharset = charset.normalizeCharset(fromCharset || 'UTF-8');

        if (typeof data === 'string') {
            return charset.encode(data);
        }

        if (UTF8_COMPATIBLE_CHARSET.test(fromCharset)) {
            return data;
        }

        const decoded = charset.decode(data, fromCharset, Iconv);
        return charset.encode(decoded);
    },

    /**
     * Converts well known invalid character set names to proper names.
     * eg. win-1257 will be converted to WINDOWS-1257
     *
     * The lowercased, trimmed name is looked up in the charsets table first.
     * If there is no entry, common misspellings are rewritten (utf8 -> utf-8,
     * us-ascii -> windows-1252, win1257 -> windows-1257, latin1 / l1 ->
     * iso-8859-1) and the table is consulted again. A name that is still
     * unknown is returned in upper case.
     *
     * @param {String} charset Charset name to convert
     * @return {String} Canonicalized charset name
     */
    normalizeCharset(charset) {
        // Call through Object.prototype so the lookup does not depend on the
        // table object exposing its own hasOwnProperty method.
        const lookup = key => (Object.prototype.hasOwnProperty.call(charsets, key) && charsets[key]) || false;

        let name = charset.toLowerCase().trim();

        // first pass
        let known = lookup(name);
        if (known) {
            return known;
        }

        name = name
            .replace(/^utf[-_]?(\d+)/, 'utf-$1')
            .replace(/^(?:us[-_]?)ascii/, 'windows-1252')
            .replace(/^win(?:dows)?[-_]?(\d+)/, 'windows-$1')
            .replace(/^(?:latin|iso[-_]?8859)?[-_]?(\d+)/, 'iso-8859-$1')
            .replace(/^l[-_]?(\d+)/, 'iso-8859-$1');

        // updated pass
        known = lookup(name);
        if (known) {
            return known;
        }

        return name.toUpperCase();
    }
});