124 lines
3.9 KiB
JavaScript
124 lines
3.9 KiB
JavaScript
|
'use strict';
|
||
|
|
||
|
const iconv = require('iconv-lite');
|
||
|
const encodingJapanese = require('encoding-japanese');
|
||
|
const charsets = require('./charsets');
|
||
|
|
||
|
/**
 * Matches normalized charset names whose bytes can be read directly as UTF-8.
 *
 * NOTE: the alternation is deliberately left ungrouped, exactly as in the
 * pattern this constant replaces: `^` binds only to the ascii branch and `$`
 * only to the `7bit` branch, so `utf-8` matches anywhere in the name.
 * normalizeCharset() can rewrite '7bit' into 'ISO-8859-7BIT' (when the
 * charsets table has no entry for it), and only the unanchored `7bit$`
 * branch still recognizes that value.
 */
const UTF8_COMPATIBLE_CHARSET = /^(us-)?ascii|utf-8|7bit$/i;

/**
 * Matches charset names that are first attempted with encoding-japanese.
 */
const JAPANESE_CHARSET = /^jis|^iso-?2022-?jp|^EUCJP/i;

/**
 * Looks up a charset name in the `charsets` table.
 *
 * Uses Object.prototype.hasOwnProperty so that inherited keys (for example
 * 'constructor') never match and a prototype-less table works as well.
 *
 * @param {String} name Lower-cased charset name
 * @return {String|undefined} Mapped name, or undefined if there is no usable own entry
 */
function lookupCharset(name) {
    if (Object.prototype.hasOwnProperty.call(charsets, name) && charsets[name]) {
        return charsets[name];
    }
    return undefined;
}

/**
 * Character set encoding and decoding functions
 */
const charset = (module.exports = {
    /**
     * Encodes a unicode string into a Buffer object as UTF-8
     *
     * We force UTF-8 here, no strange encodings allowed.
     *
     * @param {String} str String to be encoded
     * @return {Buffer} UTF-8 encoded Buffer
     */
    encode(str) {
        return Buffer.from(str, 'utf-8');
    },

    /**
     * Decodes binary data into a unicode string using the specified encoding.
     *
     * Decoding strategy, in order:
     *   1. ASCII / UTF-8 / 7bit input is read directly as UTF-8.
     *   2. If an `Iconv` constructor is supplied, it is used for the conversion.
     *   3. Japanese charsets are tried with encoding-japanese.
     *   4. Everything else (and any failed Japanese conversion) goes to iconv-lite.
     *
     * Conversion errors are not propagated: if steps 2-4 throw, the data is
     * read as UTF-8 instead, so data loss might occur for unknown charsets.
     *
     * @param {Buffer} buf Binary data to be decoded
     * @param {String} [fromCharset='UTF-8'] Binary data is decoded into string using this charset
     * @param {Function} [Iconv] node-iconv compatible constructor, called as `new Iconv(from, 'UTF-8')`
     * @return {String} Decoded string
     */
    decode(buf, fromCharset, Iconv) {
        fromCharset = charset.normalizeCharset(fromCharset || 'UTF-8');

        if (UTF8_COMPATIBLE_CHARSET.test(fromCharset)) {
            return buf.toString('utf-8');
        }

        try {
            if (typeof Iconv === 'function') {
                const decoder = new Iconv(fromCharset, 'UTF-8');
                return decoder.convert(buf).toString();
            }

            if (JAPANESE_CHARSET.test(fromCharset)) {
                if (typeof buf === 'string') {
                    // reassigned on purpose: the iconv-lite fallback below must see a Buffer too
                    buf = Buffer.from(buf);
                }
                try {
                    const output = encodingJapanese.convert(buf, {
                        to: 'UNICODE',
                        from: fromCharset,
                        type: 'string'
                    });
                    if (typeof output === 'string') {
                        // return the string itself; wrapping it in a Buffer would
                        // break the documented String return type
                        return output;
                    }
                    // a non-string result is not usable here, fall through to iconv-lite
                } catch (err) {
                    // deliberate best-effort: defaults to iconv-lite on error
                }
            }

            return iconv.decode(buf, fromCharset);
        } catch (err) {
            // enforce utf-8, data loss might occur
            return buf.toString();
        }
    },

    /**
     * Converts data from a specific encoding to a UTF-8 Buffer
     *
     * String input is encoded as UTF-8 as-is. Binary input that is already
     * ASCII / UTF-8 / 7bit is returned unchanged (the same object, not a copy);
     * any other binary input is decoded with `decode` and re-encoded as UTF-8.
     *
     * @param {String|Buffer} data Value to be converted
     * @param {String} [fromCharset='UTF-8'] Source encoding of binary `data`
     * @param {Function} [Iconv] node-iconv compatible constructor, passed through to `decode`
     * @return {Buffer} UTF-8 encoded Buffer
     */
    convert(data, fromCharset, Iconv) {
        fromCharset = charset.normalizeCharset(fromCharset || 'UTF-8');

        if (typeof data === 'string') {
            return charset.encode(data);
        }

        if (UTF8_COMPATIBLE_CHARSET.test(fromCharset)) {
            return data;
        }

        return charset.encode(charset.decode(data, fromCharset, Iconv));
    },

    /**
     * Converts well known invalid character set names to proper names.
     * eg. win-1257 will be converted to WINDOWS-1257
     *
     * The name is looked up in the `charsets` table first; if there is no
     * entry, common spelling variants are rewritten and the lookup is retried.
     * Names that are still unknown are returned upper-cased.
     *
     * @param {String} [charset='UTF-8'] Charset name to convert
     * @return {String} Canonicalized charset name
     */
    normalizeCharset(charset) {
        // same default as decode() and convert(), instead of a TypeError on missing input
        charset = String(charset || 'UTF-8')
            .toLowerCase()
            .trim();

        // first pass
        const direct = lookupCharset(charset);
        if (direct) {
            return direct;
        }

        charset = charset
            .replace(/^utf[-_]?(\d+)/, 'utf-$1')
            .replace(/^(?:us[-_]?)ascii/, 'windows-1252')
            .replace(/^win(?:dows)?[-_]?(\d+)/, 'windows-$1')
            .replace(/^(?:latin|iso[-_]?8859)?[-_]?(\d+)/, 'iso-8859-$1')
            .replace(/^l[-_]?(\d+)/, 'iso-8859-$1');

        // updated pass
        return lookupCharset(charset) || charset.toUpperCase();
    }
});
|