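The Protein class holds a protein sequence together with its optional UniProt data; this module also exports functions to parse specific text formats into Protein objects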
/**
 * A protein sequence plus the (optional) UniProt entry attached to it.
 */
class Protein {
    /**
     * @param {string} sequence - The protein sequence. The result of
     *     `md5(sequence)` is stored on the instance as `hash`.
     */
    constructor(sequence){
        this.sequence = sequence;
        this.hash = md5(sequence);
    }

    /**
     * Attach an already-fetched UniProt entry to this protein.
     * @param {Object} uniprotData - The entry to store.
     */
    setUniprotData(uniprotData){
        this.uniprotData = uniprotData;
    }

    /**
     * Fetch the entry for `accession` from the EBI Proteins API, store it on
     * this instance and hand it back.
     *
     * @param {string} accession - Accession appended verbatim to the API URL.
     * @returns {Promise<Object>} Resolves with the parsed JSON body. Rejects
     *     with the transport error, or with the SyntaxError raised when the
     *     body is not valid JSON.
     */
    retrieveUniprotData(accession){
        const url = 'https://www.ebi.ac.uk/proteins/api/proteins/' + accession;
        return new Promise((resolve, reject) => {
            request.get(url, (error, response, body) => {
                if(error){
                    return reject(error);
                }
                let protein;
                try {
                    // The callback runs outside the promise executor, so a throw
                    // here would escape the promise instead of rejecting it.
                    protein = JSON.parse(body);
                } catch(parseError){
                    return reject(parseError);
                }
                this.uniprotData = protein;
                return resolve(protein);
            });
        });
    }

    /**
     * @returns {Object|undefined} The stored UniProt entry, if any.
     */
    getUniprotData(){
        return this.uniprotData;
    }

    /**
     * @returns {*} The `accession` field of the stored UniProt entry, or the
     *     falsy `uniprotData` value itself when no entry has been stored.
     */
    getUniprotAccession(){
        return this.uniprotData && this.uniprotData.accession;
    }
}
Option name | Type | Description |
---|---|---|
text | String | A string representing the FASTA input |
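alphabet | Number | The alphabet to use for validation, given as one of the values of `alphabets` (PSI_BLAST, EXTENDED_IUPAC2, IUPAC2, IUPAC, NATURAL) |
return | Promise | A promise that, in its resolved state, yields `[proteins, sequences]`: the Protein objects and the parsed FASTA records |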
Get Protein objects from Fasta string.
/**
 * Parse FASTA text into Protein objects.
 *
 * Each record starts with a `>` line (or a `;` line when no header is being
 * read). `;` lines directly after a header are collected as comments, and
 * the following lines are concatenated into the sequence. A record whose
 * header info contains an entry with `database === "sp"` is looked up through
 * `fromAccession`; if that lookup fails, the protein is built from the parsed
 * sequence instead.
 *
 * @param {string} text - The FASTA input.
 * @param {number} alphabet - Passed through to `FASTABodyParser` and
 *     `FASTAEndReadParser` to validate sequence lines.
 * @returns {Promise<Array>} Resolves with `[proteins, sequences]`, where
 *     `sequences` holds one `{header, headerInfo, sequence, comments}` record
 *     per entry. Rejects with a string naming the first line that could not
 *     be parsed.
 * @throws {string} Synchronously, when `text` is not a string or is empty.
 */
function fromFasta(text, alphabet){
    // Argument errors are thrown synchronously as plain strings; callers may rely on that.
    if(typeof text !== 'string'){
        throw "Passed invalid object to parse function.";
    }
    if(text.length < 1){
        throw "Passed an empty string.";
    }

    const sequences = [];
    // True while sequence lines are being consumed; comments are not expected then.
    let readingSequence = false;
    // True from a header line until the first sequence line of that record.
    let readingHeaders = false;

    return new Promise((resolve, reject) => {
        const lines = text
            // Accept both LF and CRLF line endings so no stray "\r" stays on a line
            .split(/\r?\n/)
            // Drop lines whose first run of whitespace (or literal "|") covers the whole line
            .filter(s => s.replace(/[\s|\t]+/,'').length > 0);

        for(const line of lines){
            const isComment = /^;/.test(line);
            // Exactly one flag is set while inside a record (after its header or within its body)
            const inRecord = readingSequence !== readingHeaders;

            if(/^>/.test(line) || (isComment && readingHeaders === false)){
                // A ">" line always opens a record. A ";" line opens one too unless a
                // header is currently being read: comments may only follow a header.
                sequences.push({
                    header: line.substring(1),
                    headerInfo: extractFASTAHeaderInfo(line),
                    sequence: '',
                    comments: ''
                });
                readingHeaders = true;
                readingSequence = false;
            } else if(FASTAEndReadParser(alphabet).test(line) && inRecord){
                // Some sequences terminate in "*": strip only that terminator,
                // keeping the first residue of the line.
                sequences[sequences.length-1].sequence += line.substring(0, line.length-1);
                readingSequence = false;
            } else if(FASTABodyParser(alphabet).test(line) && inRecord){
                sequences[sequences.length-1].sequence += line;
                readingSequence = true;
                readingHeaders = false;
            } else if(isComment && readingSequence === false && readingHeaders === true){
                sequences[sequences.length-1].comments += line.substring(1) + ' ';
            } else {
                // Stop at the first bad line: no further parsing, no network lookups.
                return reject("Could not parse one line of FASTA input:\n\n" + line);
            }
        }

        const lookups = sequences.map(s => {
            const uniprotAccession = s.headerInfo.find(p => p.database === "sp");
            if(!uniprotAccession){
                return Promise.resolve(new Protein(s.sequence));
            }
            return fromAccession(uniprotAccession.identifier)
                .then(([proteins]) => proteins[0])
                // Deliberate best effort: fall back to the parsed sequence if the lookup fails
                .catch(() => new Protein(s.sequence));
        });

        return Promise.all(lookups)
            .then(proteins => resolve([proteins, sequences]))
            .catch(e => reject(e));
    });
}
Option name | Type | Description |
---|---|---|
accession | String | A string representing the UniProt accession number (eg.: P12345) |
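return | Promise | A promise that, in its resolved state, yields `[[protein], [uniprotEntry]]`: the Protein object and the UniProt entry it was built from |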
Get Protein object from Accession number (via UniProt).
/**
 * Build a Protein from a UniProt accession via the EBI Proteins API.
 *
 * @param {string} accession - Accession to look up; must match `accessionNumberRegex`.
 * @returns {Promise<Array>} Resolves with `[[protein], [entry]]`, where `entry`
 *     is the parsed JSON body and `protein` is built from `entry.sequence.sequence`
 *     with the entry attached. Rejects when the accession is not valid, when
 *     the request fails, or when the body is not JSON of the expected shape.
 */
function fromAccession(accession) {
    if(!accessionNumberRegex.test(accession)){
        return Promise.reject(new Error('Invalid UniProt accession: ' + String(accession)));
    }

    const url = 'https://www.ebi.ac.uk/proteins/api/proteins/' + accession;
    return new Promise((resolve, reject) => {
        request.get(url, (error, response, body) => {
            if(error){
                return reject(error);
            }
            try {
                // This callback runs outside the promise executor: a malformed body
                // or a payload without `sequence` must reject, not throw.
                const protein = JSON.parse(body);
                const p = new Protein(protein.sequence.sequence);
                p.setUniprotData(protein);
                return resolve([[p], [protein]]);
            } catch(parseError){
                return reject(parseError);
            }
        });
    });
}
Option name | Type | Description |
---|---|---|
query | String | A string representing a protein name, a gene name, or anything UniProt-queriable |
limit | Number | A number representing the maximum number of sequences returned by the query. If omitted, defaults to 2. |
return | Promise | A promise that, in its resolved state, yields the result of parsing the returned FASTA text with `fromFasta` |
Get Protein objects (via UniProt).
/**
 * Run a UniProt query and parse the FASTA text it returns.
 *
 * @param {string} query - Query text, appended verbatim to the URL.
 * @param {?number} [limit=2] - Maximum number of entries to request. The
 *     default of 2 is enough to tell whether an identifier is unique. Pass
 *     `null` to send the query without a limit parameter.
 * @returns {Promise<*>} Resolves with whatever `fromFasta` resolves with for
 *     the response body (parsed using `alphabets.IUPAC2`). Rejects with the
 *     transport error, with "No sequences found" when the body is missing or
 *     empty, or with the parsing failure reported by `fromFasta`.
 */
function fromUniprotQuery(query, limit=2) {
    // NOTE(review): `query` is not URL-encoded here; confirm whether callers pre-encode it.
    const baseUrl = 'https://www.uniprot.org/uniprot/?format=fasta';
    // A default parameter can never be observed as `undefined`, so `null` is the
    // way to ask for an unlimited query.
    const unlimited = limit === undefined || limit === null;
    const url = unlimited
        ? baseUrl + '&query=' + query
        : baseUrl + '&limit=' + limit + '&query=' + query;

    return new Promise((resolve, reject) => {
        request.get(url, (error, response, fastaProteins) => {
            if(error){
                return reject(error);
            }
            if(!fastaProteins || fastaProteins.length === 0){
                return reject("No sequences found");
            }
            try {
                // fromFasta throws synchronously on invalid arguments; this callback runs
                // outside the promise executor, so convert that into a rejection.
                return fromFasta(fastaProteins, alphabets.IUPAC2).then(resolve, reject);
            } catch(parseError){
                return reject(parseError);
            }
        });
    });
}
Option name | Type | Description |
---|---|---|
sequence | String | A string representing a protein sequence (A-Z) |
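alphabet | Number | The alphabet to use for validation, given as one of the values of `alphabets` (PSI_BLAST, EXTENDED_IUPAC2, IUPAC2, IUPAC, NATURAL) |
return | Promise | A promise that, in its resolved state, yields `[[protein], [sequence]]`: the Protein object and the matched sequence with newlines removed |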
Get Protein object from A-Z sequence
/**
 * Build a Protein from a raw sequence string.
 *
 * @param {string} sequence - Text to match against `sequenceParser(alphabet)`.
 * @param {number} alphabet - Alphabet selector handed to `sequenceParser`.
 * @returns {Promise<Array>} Resolves with `[[protein], [matched]]`, where
 *     `matched` is the first match with newlines removed. Rejects with
 *     'No sequence identified' when nothing matches.
 */
function fromSequence(sequence, alphabet) {
    return new Promise((resolve, reject) => {
        const hits = sequence.match(sequenceParser(alphabet));
        if (hits === undefined || hits === null) {
            return reject('No sequence identified');
        }
        // Newlines are stripped from every match entry; only the first one is used
        const cleaned = hits.map(entry => entry.replace(/\n/g,""));
        const firstHit = cleaned[0];
        return resolve([[new Protein(firstHit)], [firstHit]]);
    });
}
Option name | Type | Description |
---|---|---|
text | String | A string representing a FASTA sequence, an UniProt accession or a sequence in A-Z format |
alphabet | Number | The alphabet to use for validation, given as one of the values of `alphabets` (PSI_BLAST, EXTENDED_IUPAC2, IUPAC2, IUPAC, NATURAL) |
return | Number | The matching value from `parsers` if text can be parsed as a UniProt accession, an AA sequence, a FASTA file or a UniProt name; `undefined` otherwise |
Determine which kind of parsable input the text is.
/**
 * Classify `text` by the first parser that accepts it.
 *
 * Checks run in a fixed order: accession, plain sequence, FASTA, UniProt name.
 * A check only counts when it yields exactly `true`.
 *
 * @param {string} text - Input to classify.
 * @param {number} alphabet - Alphabet selector handed to `sequenceParser` and `validFasta`.
 * @returns {number|undefined} The matching entry of `parsers`, or `undefined`
 *     when no check accepts the text.
 */
function validInput(text, alphabet) {
    if (accessionNumberRegex.test(text) === true) {
        return parsers.accession;
    }
    if (sequenceParser(alphabet).test(text) === true) {
        return parsers.aa;
    }
    if (validFasta(text, alphabet) === true) {
        return parsers.fasta;
    }
    if (uniprotNameRegex.test(text) === true) {
        return parsers.protein_name;
    }
    return undefined;
}
Option name | Type | Description |
---|---|---|
text | String | A string representing a FASTA sequence, an UniProt accession or a sequence in A-Z format |
alphabet | Number | The alphabet to use for validation, given as one of the values of `alphabets` (PSI_BLAST, EXTENDED_IUPAC2, IUPAC2, IUPAC, NATURAL) |
return | function | Returns the correct function to parse the text passed, or `undefined` if no parser matches |
Get the parsing function suited to the text.
/**
 * Pick the parsing function that fits `text`.
 *
 * Checks run in a fixed order: accession, plain sequence, FASTA, UniProt name.
 * A check only counts when it yields exactly `true`.
 *
 * @param {string} text - Input to inspect.
 * @param {number} alphabet - Alphabet selector handed to `sequenceParser` and `validFasta`.
 * @returns {Function|undefined} `fromAccession`, `fromUniprotQuery`, or a
 *     one-argument wrapper around `fromSequence` / `fromFasta`; `undefined`
 *     when no check accepts the text.
 */
function autodetect(text, alphabet) {
    if (accessionNumberRegex.test(text) === true) {
        return fromAccession;
    }
    if (sequenceParser(alphabet).test(text) === true) {
        // Bind the alphabet now so detection and parsing cannot disagree
        return (input) => fromSequence(input, alphabet);
    }
    if (validFasta(text, alphabet) === true) {
        // Bind the alphabet now so detection and parsing cannot disagree
        return (input) => fromFasta(input, alphabet);
    }
    if (uniprotNameRegex.test(text) === true) {
        return fromUniprotQuery;
    }
    return undefined;
}
Collection of alphabets to be passed to parsing functions.
/**
 * Numeric identifiers of the alphabets accepted by the parsing functions.
 * Frozen so that consumers of this shared, exported object cannot alter it.
 */
const alphabets = Object.freeze({
    PSI_BLAST: 0,
    EXTENDED_IUPAC2: 1,
    IUPAC2: 2,
    IUPAC: 3,
    NATURAL: 4
});
Collection of available parsers.
/**
 * Numeric identifiers of the available parsers.
 * Frozen so that consumers of this shared, exported object cannot alter it.
 */
const parsers = Object.freeze({
    fasta: 0,
    aa: 1,
    accession: 2,
    protein_name: 3,
    uniprot: 4
});
// Public API of this module: the Protein class, the parsing entry points,
// the input-detection helpers and the two constant collections.
Object.assign(exports, {
    Protein,
    fromFasta,
    fromAccession,
    fromUniprotQuery,
    fromSequence,
    validInput,
    autodetect,
    alphabets,
    parsers
});