/**
 * Parser that detects the language of the text extracted for a query.
 *
 * Flow implemented by `parse`:
 *   1. ask the `HTML::TextExtractor` parser for the text blocks of `set.query`;
 *   2. collapse whitespace, strip tags, unescape entities and cut the text to
 *      `max_text_size` characters (taken from the start or the end);
 *   3. load https://translate.yandex.ru and read the `SID: '...'` token;
 *   4. POST the text to the Yandex `detect` endpoint and store the answer in
 *      `results.langcode` / `results.langname`.
 *
 * Every failure path sets `results.error` and `results.success = 0` instead of
 * throwing, so the caller always gets the `results` object back.
 *
 * NOTE(review): `this.logger`, `this.conf`, `this.request` and `this.parser`
 * are not defined in this class — presumably injected by the host framework;
 * the global `_` is presumably lodash/underscore. Confirm against the runtime.
 */
class Parser {
  constructor() {
    // Default settings and the list of result fields exposed by this parser.
    this.defaultConf = {
      results: {
        flat: [
          ['langcode', 'Language code'],
          ['langname', 'Language name'],
          ['error', 'Error reason'],
        ]
      },
      parsecodes: {
        200: 1,
      },
      results_format: '$query: $langcode ($langname)\\n',
      HTML_TextExtractor_ua: 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
      HTML_TextExtractor_use_proxy: false,
      max_text_size: 4096,
      text_position: 'last',
      // Language code -> human readable name, used to fill `results.langname`.
      lang_list: {
        "af":"Afrikaans","am":"Amharic","ar":"Arabic","az":"Azerbaijani","ba":"Bashkir","be":"Belarusian","bg":"Bulgarian","bn":"Bengali","bs":"Bosnian","ca":"Catalan","ceb":"Cebuano","cs":"Czech","cy":"Welsh","da":"Danish","de":"German","el":"Greek","en":"English","eo":"Esperanto","es":"Spanish","et":"Estonian","eu":"Basque","fa":"Persian","fi":"Finnish","fr":"French","ga":"Irish","gd":"Scottish Gaelic","gl":"Galician","gu":"Gujarati","he":"Hebrew","hi":"Hindi","hr":"Croatian","ht":"Haitian","hu":"Hungarian","hy":"Armenian","id":"Indonesian","is":"Icelandic","it":"Italian","ja":"Japanese","jv":"Javanese","ka":"Georgian","kk":"Kazakh","kn":"Kannada","ko":"Korean","ky":"Kyrgyz","la":"Latin","lb":"Luxembourgish","lt":"Lithuanian","lv":"Latvian","mg":"Malagasy","mhr":"Mari","mi":"Maori","mk":"Macedonian","ml":"Malayalam","mn":"Mongolian","mr":"Marathi","mrj":"Hill Mari","ms":"Malay","mt":"Maltese","ne":"Nepali","nl":"Dutch","no":"Norwegian","pa":"Punjabi","pap":"Papiamento","pl":"Polish","pt":"Portuguese","ro":"Romanian","ru":"Russian","si":"Sinhalese","sk":"Slovak","sl":"Slovenian","sq":"Albanian","sr":"Serbian","su":"Sundanese","sv":"Swedish","sw":"Swahili","ta":"Tamil","te":"Telugu","tg":"Tajik","th":"Thai","tl":"Tagalog","tr":"Turkish","tt":"Tatar","udm":"Udmurt","uk":"Ukrainian","ur":"Urdu","uz":"Uzbek","vi":"Vietnamese","xh":"Xhosa","yi":"Yiddish","zh":"Chinese"
      },
    };
    // Settings that are offered for editing, with their widget descriptions.
    this.editableConf = [
      ['HTML_TextExtractor_ua', ['textfield', 'User agent']],
      ['max_text_size', ['combobox', 'Max text size for detect',
        [1024, '1 Kb'],
        [2048, '2 Kb'],
        [4096, '4 Kb'],
        [8192, '8 Kb']
      ]],
      ['text_position', ['combobox', 'Text position for detect',
        ['first', 'First'],
        ['last', 'Last']
      ]],
      ['HTML_TextExtractor_use_proxy', ['combobox', 'Use proxy in TextExtractor',
        [true, 'Yes'],
        [false, 'No']
      ]],
    ];
  }

  /**
   * Detects the language of the text extracted for `set.query`.
   *
   * Generator: it yields three request descriptors (text extraction, SID page,
   * language detection) and expects the driver to send each response back in.
   *
   * @param {Object} set - Task descriptor; only `set.query` is read.
   * @param {Object} results - Mutated in place: `langcode`, `langname`,
   *   `error` and `success` (1 on success, 0 on any failure).
   * @returns {Object} The same `results` object.
   */
  *parse(set, results) {
    this.logger.put("Open " + set.query);
    const extraction = (yield this.parser.request('HTML::TextExtractor', 'default', {
      goodCode: {
        200: 1,
      },
      'user-agent': this.conf.HTML_TextExtractor_ua,
      useproxy: this.conf.HTML_TextExtractor_use_proxy,
    }, set.query)) || {};

    // A failed extraction may come back without a `texts` array at all; keep
    // that distinct from "extracted, but empty" so the error reason is right.
    const texts = Array.isArray(extraction.texts) ? extraction.texts : null;
    const extractionOk = Boolean(extraction.info && extraction.info.success);

    if (!extractionOk || texts === null || texts.length === 0) {
      if (texts !== null && texts.length === 0) {
        this.logger.put("Error opening " + set.query + ": No text");
        results.error = "No text";
      } else {
        this.logger.put("Error opening " + set.query);
        results.error = "Other reason";
      }
      results.success = 0;
      return results;
    }

    this.logger.put("Received " + texts.length + " text blocks");

    // Collapse whitespace, drop anything that looks like a tag, then unescape.
    const cleaned = _.unescape(texts.join(" ").replace(/\s+/g, ' ').replace(/<.+?>/g, ''));
    const maxSize = Number(this.conf.max_text_size);
    let text;
    if (this.conf.text_position === 'last') {
      text = cleaned.slice(-maxSize);
      this.logger.put("Cutted last " + text.length + " chars");
    } else {
      text = cleaned.slice(0, maxSize);
      this.logger.put("Cutted first " + text.length + " chars");
    }

    this.logger.put("Open https://translate.yandex.ru");
    const idResponse = yield this.request('GET', 'https://translate.yandex.ru', {}, {
      check_content: ['<\/html>'],
      decode: 'auto-html',
    });

    // The response object itself is truthy even for a failed request, so test
    // its `success` flag and make sure the SID token is really on the page.
    const sidMatch = idResponse && idResponse.success && typeof idResponse.data === 'string'
      ? idResponse.data.match(/SID:\s*'([^']+)/)
      : null;
    if (!sidMatch) {
      this.logger.put("Error opening https://translate.yandex.ru");
      results.error = "Getting ID failed";
      results.success = 0;
      return results;
    }

    // The request id is the SID with every dot-separated part reversed.
    const sid = sidMatch[1]
      .split('.')
      .map((part) => part.split("").reverse().join(""))
      .join('.');
    this.logger.put("Received ID successful");

    this.logger.put("Detect language");
    const langResponse = (yield this.request('POST', 'https://translate.yandex.net/api/v1/tr.json/detect?sid=' + sid + '&srv=tr-text', {
      text: text,
    }, {
      check_content: ['"code":200'],
      decode: 'auto-html',
    })) || {};

    if (!langResponse.success) {
      if (Number(langResponse.code) === 414) {
        this.logger.put("Error: Text too large. Try reduce Max text size");
        results.error = "Text too large, language detect failed";
      } else {
        this.logger.put("Error detecting language");
        results.error = "Detecting language failed";
      }
      results.success = 0;
      return results;
    }

    let langcode;
    try {
      langcode = JSON.parse(langResponse.data).lang;
    } catch (err) {
      // A body that is not valid JSON is reported as a detection failure
      // instead of aborting the whole generator.
      this.logger.put("Error detecting language");
      results.error = "Detecting language failed";
      results.success = 0;
      return results;
    }

    results.langcode = langcode;
    results.langname = this.conf.lang_list[langcode];
    results.error = "";
    results.success = 1;
    return results;
  }
}