diff --git a/pdbtools/pdb_merge.py b/pdbtools/pdb_merge.py index 3659df8..dd83546 100644 --- a/pdbtools/pdb_merge.py +++ b/pdbtools/pdb_merge.py @@ -18,8 +18,18 @@ """ Merges several PDB files into one. -The contents are not sorted and no lines are deleted (e.g. END, TER -statements) so we recommend piping the results through `pdb_tidy.py`. +Use `pdb_mkensemble` if you with to make an ensemble of multiple +conformation states of the same protein. + +Follows the criteria: + + * The merged PDB file will represent a single MODEL. + * Non-coordinate lines in input PDBs will be ignored. + * Atom numbers are restarted from 1. + * CONECT lines are yield at the end. CONECT numbers are updated to + the new atom numbers. + * Missing TER and END statements are placed accordingly. Original + TER and END statements are maintained. Usage: python pdb_merge.py @@ -41,6 +51,13 @@ __email__ = "joaomcteixeira@gmail.com" +# Python 2.7 compatibility +try: + FileNotFoundError +except NameError: + FileNotFoundError = IOError + + def check_input(args): """Checks whether to read from stdin/file and validates user input/options. """ @@ -48,16 +65,20 @@ def check_input(args): # Defaults fl = [] # file list - if len(args) >= 1: + if len(args) == 1: + sys.stderr.write('ERROR!! Please provide more than one input file.') + sys.stderr.write(__doc__) + sys.exit(1) + + elif len(args) >= 1: for fn in args: if not os.path.isfile(fn): - emsg = 'ERROR!! File not found or not readable: \'{}\'\n' + emsg = 'ERROR!! File not found or not readable: \'{}\'' sys.stderr.write(emsg.format(fn)) sys.stderr.write(__doc__) sys.exit(1) - fh = open(fn, 'r') - fl.append(fh) + fl.append(fn) else: # Whatever ... sys.stderr.write(__doc__) @@ -66,25 +87,151 @@ def check_input(args): return fl -def run(flist): +# TER 606 LEU A 75 +_fmt_TER = "TER {:>5d} {:3s} {:1s}{:>4s}{:1s}" + " " * 53 + os.linesep + + +def make_TER(prev_line, fmt_TER=_fmt_TER): + """Creates a TER statement based on the last ATOM/HETATM line.""" + # Add last TER statement + serial = int(prev_line[6:11]) + 1 + rname = prev_line[17:20] + chain = prev_line[21] + resid = prev_line[22:26] + icode = prev_line[26] + + return fmt_TER.format(serial, rname, chain, resid, icode) + + +def _get_lines_from_input(pinput, i=0): + """Decide wheter input is file or lines.""" + try: + return open(pinput, 'r') + except (FileNotFoundError, TypeError): + return pinput + + +def _update_atom_number(line, number, anisou=('ANISOU',)): + if line.startswith(anisou): + number -= 1 + return line[:6] + str(number).rjust(5) + line[11:] + + +def run(input_list): """ - Iterate over a list of files and yields each line sequentially. + Merges PDB files into a single file. + + Follows the criteria: + + * The merged PDB file will represent a single MODEL. + * Non-coordinate lines will be ignored. + * Atom numbers are restarted from 1. + * CONECT lines are yield at the end. CONECT numbers are updated + to the new atom numbers. + * TER and END statements are placed accordingly. + + Use `pdb_mkensemble` if you with to make an ensemble of multiple + conformation states of the same protein. Parameters ---------- - flist : list of file-like objects - Must handle `.close()` attribute. + input_list : iterator of iterators + `input_list` can be: + * a list of file paths + * a list of file handlers + * a list of lists of lines, the latter representing the + content of the different input PDB files Yields ------ str (line-by-line) - Lines from the concatenated PDB files. + Lines from the merged PDB files. """ + records = ('ATOM', 'HETATM', 'ANISOU', 'CONECT', 'MODEL', 'ENDMDL') + atom_anisou = ('ATOM', 'ANISOU') + atom_hetatm = ('ATOM', 'HETATM') + hetatm = ('HETATM',) + conect = ('CONECT',) + prev_chain = None + chain = None + prev_line = '' + conect_lines = [] + + # CONECT logic taken from pdb_preatom + fmt_CONECT = "CONECT{:>5s}{:>5s}{:>5s}{:>5s}{:>5s}" + " " * 49 + os.linesep + char_ranges = ( + slice(6, 11), + slice(11, 16), + slice(16, 21), + slice(21, 26), + slice(26, 31), + ) + atom_number = 1 + + for input_item in input_list: + + lines = _get_lines_from_input(input_item) + + # store for CONECT statements + # restart at each PDB. Read docs above + serial_equiv = {'': ''} + + for line in lines: + + if not line.startswith(records): + continue + + chain = line[21] + + if line.startswith(atom_hetatm): + serial_equiv[line[6:11].strip()] = atom_number + + if \ + line.startswith(hetatm) \ + and prev_line.startswith(atom_anisou): + + yield _update_atom_number(make_TER(prev_line), atom_number) + atom_number += 1 + + elif \ + prev_chain is not None \ + and chain != prev_chain \ + and prev_line.startswith(atom_anisou): + + yield _update_atom_number(make_TER(prev_line), atom_number) + atom_number += 1 + + elif line.startswith(conect): + + # 6:11, 11:16, 16:21, 21:26, 26:31 + serials = (line[cr].strip() for cr in char_ranges) + + # If not found, return default + new_serials = (str(serial_equiv.get(s, s)) for s in serials) + conect_line = fmt_CONECT.format(*new_serials) + + conect_lines.append(conect_line) + + continue + + elif not line.strip(os.linesep).strip(): + continue + + yield _update_atom_number(line, atom_number) + atom_number += 1 + + prev_line = line + prev_chain = chain + + try: + lines.close() + except AttributeError: + pass + + for line in conect_lines: + yield line - for fhandle in flist: - for line in fhandle: - yield line - fhandle.close() + yield 'END' + os.linesep concatenate_files = run diff --git a/tests/data/dummy_merge_A.pdb b/tests/data/dummy_merge_A.pdb new file mode 100644 index 0000000..918cd1b --- /dev/null +++ b/tests/data/dummy_merge_A.pdb @@ -0,0 +1,15 @@ +HEADER CHAIN A FOR pdb_merge +ATOM 1 N ASN A 1 22.066 40.557 0.420 1.00 0.00 N +ATOM 2 CA BASN A 1 20.000 30.000 0.005 0.60 0.00 C +ATOM 3 CA AASN A 1 21.411 39.311 0.054 0.40 0.00 C +ATOM 4 C ASN A 1 22.143 38.629 -1.102 1.00 0.00 C +ATOM 5 O ASN A 1 21.581 38.297 -2.176 1.00 0.00 O +ATOM 6 N ARG A 2 23.408 38.395 -0.829 1.00 0.00 N +ATOM 7 CA ARG A 2 24.384 37.823 -1.757 1.00 0.00 C +ATOM 8 C ARG A 2 24.061 36.421 -2.189 1.00 0.00 C +ATOM 9 O ARG A 2 24.411 36.106 -3.325 1.00 0.00 O +ATOM 10 N GLU A 3 23.456 35.570 -1.408 1.00 0.00 N +ATOM 11 CA GLU A 3 23.064 34.219 -1.780 1.00 0.00 C +ATOM 12 C GLU A 3 21.682 34.241 -2.380 1.00 0.00 C +ATOM 13 O GLU A 3 21.239 33.190 -2.786 1.00 0.00 O +TER 17 HOH A 6 diff --git a/tests/data/dummy_merge_B.pdb b/tests/data/dummy_merge_B.pdb new file mode 100644 index 0000000..2888354 --- /dev/null +++ b/tests/data/dummy_merge_B.pdb @@ -0,0 +1,14 @@ +HEADER CHAIN B FOR pdb_merge +ATOM 1 N ARG B 1 36.898 42.175 -2.688 1.00 0.00 N +ATOM 2 CA ARG B 1 37.080 43.455 -3.421 1.00 0.00 C +ATOM 3 C ARG B 1 36.102 44.524 -2.998 1.00 0.00 C +ATOM 4 O ARG B 1 36.577 45.677 -2.879 1.00 0.00 O +ATOM 5 N GLU B 2 34.849 44.167 -2.710 1.00 0.00 N +ATOM 6 CA GLU B 2 33.861 45.127 -2.233 1.00 0.00 C +ATOM 7 C GLU B 2 34.180 45.629 -0.820 1.00 0.00 C +ATOM 8 O GLU B 2 33.914 46.775 -0.464 1.00 0.00 O +ATOM 9 N ALA B 3 34.725 44.679 -0.087 1.00 0.00 N +ATOM 10 CA ALA B 3 35.081 45.036 1.305 1.00 0.00 C +ATOM 11 C ALA B 3 36.213 46.067 1.258 1.00 0.00 C +ATOM 12 O ALA B 3 36.287 47.046 2.028 1.00 0.00 O +HETATM 13 O HOH B 4 11.052 -12.419 29.700 1.00 73.70 diff --git a/tests/data/dummy_merge_C.pdb b/tests/data/dummy_merge_C.pdb new file mode 100644 index 0000000..25b1a25 --- /dev/null +++ b/tests/data/dummy_merge_C.pdb @@ -0,0 +1,21 @@ +ATOM 1 N ARG C 1 36.898 42.175 -2.688 1.00 0.00 N +ATOM 2 CA ARG C 1 37.080 43.455 -3.421 1.00 0.00 C +ATOM 3 C ARG C 1 36.102 44.524 -2.998 1.00 0.00 C +ATOM 4 O ARG C 1 36.577 45.677 -2.879 1.00 0.00 O +ATOM 5 N GLU C 2 34.849 44.167 -2.710 1.00 0.00 N +ATOM 6 CA GLU C 2 33.861 45.127 -2.233 1.00 0.00 C +ATOM 7 C GLU C 2 34.180 45.629 -0.820 1.00 0.00 C +ATOM 8 O GLU C 2 33.914 46.775 -0.464 1.00 0.00 O +ATOM 9 N MET C 3 43.010 -16.998 71.911 1.00 54.34 +ATOM 10 CA MET C 3 42.850 -16.494 70.506 1.00 52.98 +ATOM 11 C MET C 3 41.752 -17.205 69.684 1.00 52.05 +ATOM 12 O MET C 3 41.560 -18.418 69.777 1.00 54.00 +TER 13 MET C 3 +HETATM 14 O HOH C 4 -8.172 -22.003 57.197 1.00 70.53 +HETATM 15 O HOH C 5 36.020 -23.583 73.186 1.00 24.82 +HETATM 16 O HOH C 6 41.203 -28.852 57.698 1.00 53.16 +HETATM 17 O HOH C 7 -4.491 -9.687 56.752 1.00 55.08 +HETATM 18 O HOH C 8 24.561 0.532 70.565 1.00 44.77 +CONECT 16 17 +CONECT 1 2 4 5 + diff --git a/tests/data/dummy_merged.pdb b/tests/data/dummy_merged.pdb new file mode 100644 index 0000000..8cd4777 --- /dev/null +++ b/tests/data/dummy_merged.pdb @@ -0,0 +1,49 @@ +ATOM 1 N ASN A 1 22.066 40.557 0.420 1.00 0.00 N +ATOM 2 CA BASN A 1 20.000 30.000 0.005 0.60 0.00 C +ATOM 3 CA AASN A 1 21.411 39.311 0.054 0.40 0.00 C +ATOM 4 C ASN A 1 22.143 38.629 -1.102 1.00 0.00 C +ATOM 5 O ASN A 1 21.581 38.297 -2.176 1.00 0.00 O +ATOM 6 N ARG A 2 23.408 38.395 -0.829 1.00 0.00 N +ATOM 7 CA ARG A 2 24.384 37.823 -1.757 1.00 0.00 C +ATOM 8 C ARG A 2 24.061 36.421 -2.189 1.00 0.00 C +ATOM 9 O ARG A 2 24.411 36.106 -3.325 1.00 0.00 O +ATOM 10 N GLU A 3 23.456 35.570 -1.408 1.00 0.00 N +ATOM 11 CA GLU A 3 23.064 34.219 -1.780 1.00 0.00 C +ATOM 12 C GLU A 3 21.682 34.241 -2.380 1.00 0.00 C +ATOM 13 O GLU A 3 21.239 33.190 -2.786 1.00 0.00 O +TER 14 GLU A 3 +ATOM 15 N ARG B 1 36.898 42.175 -2.688 1.00 0.00 N +ATOM 16 CA ARG B 1 37.080 43.455 -3.421 1.00 0.00 C +ATOM 17 C ARG B 1 36.102 44.524 -2.998 1.00 0.00 C +ATOM 18 O ARG B 1 36.577 45.677 -2.879 1.00 0.00 O +ATOM 19 N GLU B 2 34.849 44.167 -2.710 1.00 0.00 N +ATOM 20 CA GLU B 2 33.861 45.127 -2.233 1.00 0.00 C +ATOM 21 C GLU B 2 34.180 45.629 -0.820 1.00 0.00 C +ATOM 22 O GLU B 2 33.914 46.775 -0.464 1.00 0.00 O +ATOM 23 N ALA B 3 34.725 44.679 -0.087 1.00 0.00 N +ATOM 24 CA ALA B 3 35.081 45.036 1.305 1.00 0.00 C +ATOM 25 C ALA B 3 36.213 46.067 1.258 1.00 0.00 C +ATOM 26 O ALA B 3 36.287 47.046 2.028 1.00 0.00 O +TER 27 ALA B 3 +HETATM 28 O HOH B 4 11.052 -12.419 29.700 1.00 73.70 +ATOM 29 N ARG C 1 36.898 42.175 -2.688 1.00 0.00 N +ATOM 30 CA ARG C 1 37.080 43.455 -3.421 1.00 0.00 C +ATOM 31 C ARG C 1 36.102 44.524 -2.998 1.00 0.00 C +ATOM 32 O ARG C 1 36.577 45.677 -2.879 1.00 0.00 O +ATOM 33 N GLU C 2 34.849 44.167 -2.710 1.00 0.00 N +ATOM 34 CA GLU C 2 33.861 45.127 -2.233 1.00 0.00 C +ATOM 35 C GLU C 2 34.180 45.629 -0.820 1.00 0.00 C +ATOM 36 O GLU C 2 33.914 46.775 -0.464 1.00 0.00 O +ATOM 37 N MET C 3 43.010 -16.998 71.911 1.00 54.34 +ATOM 38 CA MET C 3 42.850 -16.494 70.506 1.00 52.98 +ATOM 39 C MET C 3 41.752 -17.205 69.684 1.00 52.05 +ATOM 40 O MET C 3 41.560 -18.418 69.777 1.00 54.00 +TER 41 MET C 3 +HETATM 42 O HOH C 4 -8.172 -22.003 57.197 1.00 70.53 +HETATM 43 O HOH C 5 36.020 -23.583 73.186 1.00 24.82 +HETATM 44 O HOH C 6 41.203 -28.852 57.698 1.00 53.16 +HETATM 45 O HOH C 7 -4.491 -9.687 56.752 1.00 55.08 +HETATM 46 O HOH C 8 24.561 0.532 70.565 1.00 44.77 +CONECT 44 45 +CONECT 29 30 32 33 +END diff --git a/tests/test_pdb_merge.py b/tests/test_pdb_merge.py index cfd16e0..23c471f 100644 --- a/tests/test_pdb_merge.py +++ b/tests/test_pdb_merge.py @@ -54,7 +54,7 @@ def exec_module(self): return def test_default(self): - """$ pdb_merge data/dummy.pdb""" + """$ pdb_merge data/dummy.pdb data/dummy.pdb""" # Simulate input sys.argv = ['', os.path.join(data_dir, 'dummy.pdb'), @@ -65,15 +65,67 @@ def test_default(self): # Validate results self.assertEqual(self.retcode, 0) # ensure the program exited OK. - self.assertEqual(len(self.stdout), 408) # no lines deleted + self.assertEqual(len(self.stdout), 383) # deleted non-record self.assertEqual(len(self.stderr), 0) # no errors + def test_merge_three_chains(self): + """$ pdb_merge data/dummy_merge_A.pdb data/dummy_merge_B.pdb data/dummy_merge_C.pdb""" + # Simulate input + sys.argv = [ + '', + os.path.join(data_dir, 'dummy_merge_A.pdb'), + os.path.join(data_dir, 'dummy_merge_B.pdb'), + os.path.join(data_dir, 'dummy_merge_C.pdb'), + ] + + result_file = os.path.join(data_dir, 'dummy_merged.pdb') + + # Execute the script + self.exec_module() + + # Validate results + self.assertEqual(self.retcode, 0) # ensure the program exited OK. + self.assertEqual(len(self.stderr), 0) # no errors + + with open(result_file, 'r') as fin: + expected_lines = [l.strip(os.linesep) for l in fin.readlines()] + + self.assertEqual(self.stdout, expected_lines) + + def test_merge_three_chains_library(self): + """ + Use pdb_merge as library. + + + """ + # Simulate input + fA = os.path.join(data_dir, 'dummy_merge_A.pdb') + fB = os.path.join(data_dir, 'dummy_merge_B.pdb') + fC = os.path.join(data_dir, 'dummy_merge_C.pdb') + + with open(fB, 'r') as fin: + fB_lines = list(fin.readlines()) + + input_data = [fA, fB_lines, fC] + + result_file = os.path.join(data_dir, 'dummy_merged.pdb') + + from pdbtools.pdb_merge import run + result = [line.strip(os.linesep) for line in run(input_data)] + + with open(result_file, 'r') as fin: + expected_lines = [l.strip(os.linesep) for l in fin.readlines()] + + self.assertEqual(result, expected_lines) + + def test_file_not_found(self): """$ pdb_merge not_existing.pdb""" # Error (file not found) - afile = os.path.join(data_dir, 'not_existing.pdb') - sys.argv = ['', afile] + afile = os.path.join(data_dir, 'dummy.pdb') + bfile = os.path.join(data_dir, 'not_existing.pdb') + sys.argv = ['', afile, bfile] # Execute the script self.exec_module() @@ -83,6 +135,22 @@ def test_file_not_found(self): self.assertEqual(self.stderr[0][:22], "ERROR!! File not found") # proper error message + def test_error_single_file(self): + """$ pdb_merge dummy.pdb""" + + # Error (file not found) + afile = os.path.join(data_dir, 'dummy.pdb') + sys.argv = ['', afile] + + # Execute the script + self.exec_module() + + self.assertEqual(self.retcode, 1) # exit code is 1 (error) + self.assertEqual(len(self.stdout), 0) # nothing written to stdout + self.assertEqual(self.stderr[0][:48], + 'ERROR!! Please provide more than one input file.' + ) + @unittest.skipIf(os.getenv('SKIP_TTY_TESTS'), 'skip on GHA - no TTY') def test_helptext(self): """$ pdb_merge"""