Nantes Université

Skip to content
Extraits de code Groupes Projets
Valider 02964c27 rédigé par Adrien Leger's avatar Adrien Leger
Parcourir les fichiers

update documentation

parent 8109cdb6
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
...@@ -89,7 +89,6 @@ class BlastHit(object): ...@@ -89,7 +89,6 @@ class BlastHit(object):
self.s_orient = "+" self.s_orient = "+"
self.s_start = int(s_start)-1 self.s_start = int(s_start)-1
self.s_end = int(s_end) self.s_end = int(s_end)
else: else:
self.s_orient = "-" self.s_orient = "-"
self.s_start = int(s_start) self.s_start = int(s_start)
......
...@@ -36,10 +36,12 @@ class Blastn(object): ...@@ -36,10 +36,12 @@ class Blastn(object):
dbtype="nucl", input_type="fasta"): dbtype="nucl", input_type="fasta"):
""" """
Create a blastdb from a reference fastq file Create a blastdb from a reference fastq file
@param makeblastdb_exec Path of the makeblastdb executable by default "makeblastdb" @param ref_path Path to the reference fasta file (not gzipped). Mandatory
@param makeblastdb_opt makeblastdb command line options as a string @param makeblastdb_exec Path of the makeblastdb executable. Default = "makeblastdb"
@param dbtype Molecule type of target db ('nucl', 'prot') @param makeblastdb_opt makeblastdb command line options as a string. Default = ""
@param input_type Type of the data specified in input_file ('asn1_bin', 'asn1_txt', 'blastdb', 'fasta') @param dbtype Molecule type of target db ('nucl', 'prot'). Default = "nucl"
@param input_type Type of the data specified in input_file ('asn1_bin', 'asn1_txt',
'blastdb', 'fasta'). Default = "fasta"
""" """
# Creating object variables # Creating object variables
self.ref_path = ref_path self.ref_path = ref_path
...@@ -56,7 +58,6 @@ class Blastn(object): ...@@ -56,7 +58,6 @@ class Blastn(object):
self.ref_path, self.db_path) self.ref_path, self.db_path)
print ("CREATE DATABASE: {}\n".format(cmd)) print ("CREATE DATABASE: {}\n".format(cmd))
# Run the command line without stdin and asking both stdout and stderr # Run the command line without stdin and asking both stdout and stderr
try: try:
# Execute the command line in the default shell # Execute the command line in the default shell
...@@ -73,7 +74,7 @@ class Blastn(object): ...@@ -73,7 +74,7 @@ class Blastn(object):
except Exception as E: except Exception as E:
print (E) print (E)
self._rm_db() self.rm_db()
self.db_dir = self.db_path = None self.db_dir = self.db_path = None
# Enter and exit are defined to use the with statement # Enter and exit are defined to use the with statement
...@@ -83,7 +84,7 @@ class Blastn(object): ...@@ -83,7 +84,7 @@ class Blastn(object):
def __exit__(self, type, value, traceback): def __exit__(self, type, value, traceback):
"""Destructor to remove the database and unziped fasta files if needed""" """Destructor to remove the database and unziped fasta files if needed"""
if self.db_dir: if self.db_dir:
self._rm_db() self.rm_db()
# Typical string methods # Typical string methods
def __str__(self): def __str__(self):
...@@ -104,13 +105,13 @@ class Blastn(object): ...@@ -104,13 +105,13 @@ class Blastn(object):
evalue=1, best_query_hit = False): evalue=1, best_query_hit = False):
""" """
Blast query against a subject database and return a list of BlastHit object Blast query against a subject database and return a list of BlastHit object
@param query_path Path to a fasta file containing the query sequences @param query_path Path to a fasta file containing the query sequences (not gzipped). Mandatory
@param blast_exec Path of the blast executable. By Default blastn will be used @param blast_exec Path of the blast executable. By Default blastn will be used. Default = "blastn"
@param blastn_opt Blastn command line options as a string @param blastn_opt Blastn command line options as a string. Default = ""
@param task Type of blast to be performed ('blastn' 'blastn-short' 'dc-megablast' @param task Type of blast to be performed ('blastn' 'blastn-short' 'dc-megablast'
'megablast' 'rmblastn'). By default "dc-megablast" 'megablast' 'rmblastn'). Default = "dc-megablast"
@param evalue E Value cuttoff to retain alignments @param evalue E Value cuttoff to retain alignments. Default = 1
@param best_query_hit find and return only the best hit per query @param best_query_hit find and return only the best hit per query. Default = False
@return A list of BlastHit objects if at least one hit was found @return A list of BlastHit objects if at least one hit was found
""" """
...@@ -118,8 +119,6 @@ class Blastn(object): ...@@ -118,8 +119,6 @@ class Blastn(object):
blast_exec, blastn_opt, cpu_count(), task, evalue, query_path, self.db_path) blast_exec, blastn_opt, cpu_count(), task, evalue, query_path, self.db_path)
print ("MAKE BLAST: {}\n".format(cmd)) print ("MAKE BLAST: {}\n".format(cmd))
# Execute the command line in the default shell # Execute the command line in the default shell
proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
stdout, stderr = proc.communicate() stdout, stderr = proc.communicate()
...@@ -175,8 +174,6 @@ class Blastn(object): ...@@ -175,8 +174,6 @@ class Blastn(object):
print ("\t{} hits retained".format(len(hit_list))) print ("\t{} hits retained".format(len(hit_list)))
return hit_list return hit_list
#~~~~~~~PRIVATE METHODS~~~~~~~# def rm_db(self):
def _rm_db(self):
print ("Cleaning up blast DB files for \"{}\"\n".format(self.db_basename)) print ("Cleaning up blast DB files for \"{}\"\n".format(self.db_basename))
rmtree(self.db_dir) rmtree(self.db_dir)
# pyBlast 0.1 # pyBlast 0.1
**Simple Python 2.7/3.3 wrapper for BLAST+** **Simple and lightweight Python 2.7 wrapper module for NCBI BLAST+**
**Creation : 2015/05/18** **Creation : 2015/05/18**
**Last update : 2015/05/18** **Last update : 2015/05/22**
## MakeBlastDB ## BlastHit
Python object representing a hit found by blastn. The object contains the following public fields:
* id: Auto incremented unique identifier [INT]
* q_id: Query sequence name [STR]
* s_id: Subject sequence name [STR]
* identity: % of identity in the hit [FLOAT 0:100]
* length: length of the hit [INT >=0]
* mis: Number of mismatches in the hit [INT >=0]
* gap: Number of gaps in the hit [INT >=0]
* q_start: Hit start position of the query sequence [INT >=0]
* q_end: Hit end position of the query sequence [INT >=0]
* s_start: Hit start position of the subject sequence [INT >=0]
* s_end: Hit end position of the subject sequence [INT >=0]
* evalue: E value of the alignment [FLOAT >=0]
* bscore: Bit score of the alignment [FLOAT >=0]
* q_seq: Sequence of the query aligned on the subject sequence [STR]
* q_orient: Orientation of the query sequence [+ or -]
* s_orient: Orientation of the subject sequence [+ or -]
The validity of numeric values is checked upon instantiation. Invalid values will raise assertion errors.
## Blastn
This class contains the wrapper for Blastn and requires the installation of ncbi Blast+ 2.2.28+.
### Setup Blastn object: Create subject database
Upon instantiation, a database is created from the user-provided subject sequence. Database files are created in a temporary directory.
The following parameters can be customized at Blastn object instantiation:
* ref_path: Path to the reference fasta file (not gzipped). Mandatory
* makeblastdb_exec: Path of the makeblastdb executable. Default = "makeblastdb"
* makeblastdb_opt: makeblastdb command line options as a string. Default = ""
* dbtype: Molecule type of target db ('nucl', 'prot'). Default = "nucl"
* input_type: type of the data specified in input_file ('asn1_bin', 'asn1_txt', 'blastdb', 'fasta'). Default = "fasta"
To ensure proper deletion of the database files at the end of the execution, it is possible to use the object with the `with` statement.
Alternatively you can call the `rm_db` method at the end of the Blastn usage.
**Code**
```
with Blastn(ref_path="./subject.fa") as blastn:
print (blastn)
```
**Output**
```
CREATE DATABASE: makeblastdb -dbtype nucl -input_type fasta -in subject.fa -out /tmp/tmpihszgZ/subject
MAKEBLASTDB CLASS Parameters list
db_basename subject
db_dir /tmp/tmpihszgZ
db_path /tmp/tmpihszgZ/subject
dbtype nucl
input_type fasta
makeblastdb_exec makeblastdb
makeblastdb_opt
ref_path subject.fa
Cleaning up blast DB files for "subject"
```
... ### Calling Blastn object: Perform Blastn and return a list of hits
## MakeBlastn The Blastn object can then be directly called, as many times as desired, with query fasta files that can contain several sequences.
The method will automatically call blastn in a multiprocessing fashion, using as many threads as possible.
The following parameters can be customized when calling a Blastn object:
... * query_path: Path to a fasta file containing the query sequences (not gzipped). Mandatory
* blast_exec: Path of the blast executable. By Default blastn will be used. Default = "blastn"
* blastn_opt: Blastn command line options as a string. Default = ""
* task: Type of blast to be performed ('blastn' 'blastn-short' 'dc-megablast' 'megablast' 'rmblastn'). Default = "dc-megablast"
* evalue: E Value cutoff to retain alignments. Default = 1
* best_query_hit: find and return only the best hit per query. Default = False
## BlastHit A list containing 1 BlastHit object for each query hit found in the subject will be returned, except if no hit was found, in which case 'None' will be returned.
If the best_query_hit flag was set to True, only the best hit per query sequence from the query file will be returned.
... **Code**
```
with Blastn(ref_path="./subject.fa") as blastn:
hit_list = blastn(query_path="./query.fa")
for hit in hit_list:
print (hit)
```
**Output**
```
CREATE DATABASE: makeblastdb -dbtype nucl -input_type fasta -in ./subject.fa -out /tmp/tmp1ZBlfT/subject
## test_pyBlast MAKE BLAST: blastn -num_threads 4 -task dc-megablast -evalue 1 -outfmt "6 std qseq" -dust no -query ./query.fa -db /tmp/tmp1ZBlfT/subject
* Install pytest 2 hits found
HIT 0 Query query1:0-48(+)
Subject subject:19-67(+)
Lenght : 48 Identity : 100.0% Evalue : 2e-23 Bit score : 87.8
Aligned query seq : GCATGCTCGATCAGTAGCTCTCAGTACGCATACGCTAGCATCACGACT
HIT 1 Query query2:0-48(+)
Subject subject:89-137(+)
Lenght : 48 Identity : 100.0% Evalue : 2e-23 Bit score : 87.8
Aligned query seq : CGCATCGACTCGATCTGATCAGCTCACAGTCAGCATCAGCTACGATCA
Cleaning up blast DB files for "subject"
```
## Testing pyBlast module
The module can be easily tested thanks to pytest
* Install pytest with pip `pip install pytest`
* Run test with py.test-2.7 -v * Run test with py.test-2.7 -v
Example of output if succesfull. Please note that some tests might fail due to the random sampling of DNA sequences. Example of output if successful. Please note that some tests might fail due to the random sampling of DNA sequences, and uncertainties of Blastn algorithm.
``` ```
========================================== test session starts =========================================== ========================================== test session starts ===========================================
platform linux2 -- Python 2.7.5 -- py-1.4.27 -- pytest-2.7.0 -- /usr/bin/python platform linux2 -- Python 2.7.5 -- py-1.4.27 -- pytest-2.7.0 -- /usr/bin/python
...@@ -55,10 +151,10 @@ test_pyBlast.py::test_Blastn[rmblastn-Random queries] xfail ...@@ -55,10 +151,10 @@ test_pyBlast.py::test_Blastn[rmblastn-Random queries] xfail
================================== 6 passed, 15 xfailed in 5.91 seconds ================================== ================================== 6 passed, 15 xfailed in 5.91 seconds ==================================
``` ```
## Dependencies ## Dependencies
ncbi blast+
python package pytest for tests * [ncbi Blast+ 2.2.28+](http://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE_TYPE=BlastDocs&DOC_TYPE=Download)
* [python package pytest](http://pytest.org/latest/): `pip install pytest`
## Authors and Contact ## Authors and Contact
...@@ -66,4 +162,4 @@ Adrien Leger - 2015 ...@@ -66,4 +162,4 @@ Adrien Leger - 2015
* <adrien.leger@gmail.com> - <adrien.leger@inserm.fr> - <adrien.leger@univ-nantes.fr> * <adrien.leger@gmail.com> - <adrien.leger@inserm.fr> - <adrien.leger@univ-nantes.fr>
* [Github](https://github.com/a-slide) * [Github](https://github.com/a-slide)
* [Atlantic Gene Therapies - INSERM 1089](http://www.atlantic-gene-therapies.fr/) * [Atlantic Gene Therapies - INSERM 1089](http://www.atlantic-gene-therapies.fr/)
\ No newline at end of file
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Veuillez vous inscrire ou vous pour commenter