Modifying values in .pdb file and save as new

87 views Asked by At

for an assignment I have to read a small .pdb file in Python, change some values, and save it as a new .pdb file.
My original file is like this:

ATOM     19  HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H  
ATOM     20  C   TYR     1      29.796  62.354  41.246  1.00  0.00           C  
ATOM     23  H   SER     2      30.611  61.950  39.410  1.00  0.00           H  
ATOM     24  CA  SER     2      30.082  64.035  39.354  1.00  0.00           C 
END 

I have tried with Pandas but with no success, as I cannot save it in the desired extension without it also saving an index, which I don't want (I used .to_csv('newfile.pdb')).
I have found a module called BioPython, and this is my attempt with it:

from Bio.PDB import PDBParser, PDBIO

def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

pdb_file = "pdb1.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(output_pdb)

This does what I want as far as changing the values but when I save it adds an unwanted line, like so:

ATOM     19  HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H  
ATOM     20  C   TYR     1      29.796  62.354  41.246  1.00  0.00           C  
ATOM     23  H   SER     2      36.111  61.950  39.410  1.00  0.00           H  
ATOM     24  CA  SER     2      35.582  64.035  39.354  1.00  0.00           C  
TER      33      SER     2
END

How can I save it without that last line?

Thank you for your help!

2

There are 2 answers

1
Umar On BEST ANSWER

I have dded a new function save_without_ter_end to save the modified structure without TER and END lines. The save_without_ter_end function constructs each line of the PDB file manually, ensuring that only ATOM records are written and appending the END line at the end. I removed the usage of PDBIO since it didn't provide the desired result of excluding TER and END lines.

from Bio.PDB import PDBParser, PDBIO

def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

def save_without_ter_end(structure, filename):
    with open(filename, 'w') as pdb_file:
        for model in structure:
            for chain in model:
                for residue in chain:
                    for atom in residue:
                        atom_line = f"ATOM  {atom.serial_number:<5d} {atom.get_name():>4s} {residue.resname:>3s} {chain.id}{residue.id[1]:>4d}    {atom.coord[0]:8.3f}{atom.coord[1]:8.3f}{atom.coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f}          {atom.element:>2s}\n"
                        pdb_file.write(atom_line)
        pdb_file.write("END\n")


pdb_file = "pdb1.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file without TER and END lines
output_pdb = "modified_pdb1.pdb"
save_without_ter_end(structure, output_pdb)


ATOM  19     HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H
ATOM  20       C TYR     1      29.796  62.354  41.246  1.00  0.00           C
ATOM  23       H SER     2      36.111  61.950  39.410  1.00  0.00           H
ATOM  24      CA SER     2      35.582  64.035  39.354  1.00  0.00           C
END

or simply add this in your command. This code iterates over each atom in the structure and manually writes the PDB file without including the TER and END line.

# Manually write the PDB file
with open(output_pdb, 'w') as pdb_file:
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    atom_line = f"{atom.get_id():6s}{atom.get_name():5s}{residue.resname:>4s} {chain.id:4s}{residue.id[1]:>4d}    {atom.coord[0]:8.3f}{atom.coord[1]:8.3f}{atom.coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f}          {atom.element:>2s}\n"
                    pdb_file.write(atom_line)

or u can acheive same by following

import pandas as pd
from Bio.PDB import PDBParser, PDBIO
# Read the original PDB file into a DataFrame
pdb_file = "pdb1.pdb"
columns = ["record_name", "atom_number", "atom_name", "residue_name", "chain_id", "residue_number", "x", "y", "z", "occupancy", "b_factor", "element_symbol", "charge"]
df = pd.read_csv(pdb_file, delim_whitespace=True, names=columns)

def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

# Read the original PDB file
pdb_file = "pdb1.pdb"
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Define translation vector
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)


with open(output_pdb, 'w') as pdb_file:
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    atom_line = f"ATOM  {atom.serial_number:>4d} {atom.element:>2s} {residue.resname:>4s} {chain.id:4s}{residue.id[1]:>4d}    {atom.coord[0]:8.3f}{atom.coord[1]:8.3f}{atom.coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f}          {atom.element:>2s}\n"
                    pdb_file.write(atom_line)
    pdb_file.write("END\n")

print("Modified PDB file saved successfully.")

or

from Bio.PDB import PDBParser, PDBIO
import pandas as pd

# Read the original PDB file
pdb_file = "pdb1.pdb"
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Create an empty DataFrame to store the atom information
atom_data = []

# Iterate over the structure and populate the DataFrame with atom information
for model in structure:
    for chain in model:
        for residue in chain:
            for atom in residue:
                atom_data.append([
                    atom.serial_number,
                    atom.element,
                    residue.resname,
                    chain.id,
                    residue.id[1],
                    atom.coord[0],
                    atom.coord[1],
                    atom.coord[2],
                    atom.occupancy,
                    atom.bfactor,
                    atom.element
                ])

# Create a DataFrame from the collected atom data
columns = ["atom_number", "atom_name", "residue_name", "chain_id", "residue_number", "x", "y", "z", "occupancy", "b_factor", "charge"]
df = pd.DataFrame(atom_data, columns=columns)
# Define translation vector
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
ser_mask = df["residue_name"] == "SER"
df.loc[ser_mask, ["x", "y", "z"]] += translation_vector

# Save the modified DataFrame to a new PDB file
output_pdb = "modified_pdb1.pdb"
output_pdb = "modified_pdb1.pdb"
# Save the DataFrame to a new PDB file
output_pdb = "modified_pdb1.pdb"
with open(output_pdb, 'w') as pdb_file:
    for index, row in df.iterrows():
        atom_line = f"ATOM  {row['atom_number']:>4d} {row['atom_name']:>2s} {row['residue_name']:>4s} {row['chain_id']:4s}{row['residue_number']:>4d}    {row['x']:8.3f}{row['y']:8.3f}{row['z']:8.3f}{row['occupancy']:6.2f}{row['b_factor']:6.2f}          {row['charge']:>2s}\n"
        pdb_file.write(atom_line)
    pdb_file.write("END\n")

print("Modified PDB file saved successfully.")
0
pippo1980 On

Ok my attempt , using your input


from Bio.PDB import PDBParser, PDBIO

import Bio

print('Biopython version : ', Bio.__version__)


def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

pdb_file = "input.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(output_pdb , write_end = False , preserve_atom_numbering = True)

will give :

ATOM     19  HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H  
ATOM     20  C   TYR     1      29.796  62.354  41.246  1.00  0.00           C  
ATOM     23  H   SER     2      36.111  61.950  39.410  1.00  0.00           H  
ATOM     24  CA  SER     2      35.582  64.035  39.354  1.00  0.00           C  
TER      25      SER     2                                                       

thanks to write_end = False in pdb_io.save(output_pdb , write_end = False , preserve_atom_numbering = True)

see Documentation class Bio.PDB.PDBIO.PDBIO(use_model_flag=0)

Longer code :


from Bio.PDB import PDBParser, PDBIO, Select

import Bio

from Bio.PDB.PDBExceptions import PDBIOException

print('Biopython version : ', Bio.__version__)

class PDBIO(PDBIO):
    
    def save(self, file, select = Select(), write_end=True, preserve_atom_numbering=False):
        """Save structure to a file.

        :param file: output file
        :type file: string or filehandle

        :param select: selects which entities will be written.
        :type select: object

        Typically select is a subclass of L{Select}, it should
        have the following methods:

         - accept_model(model)
         - accept_chain(chain)
         - accept_residue(residue)
         - accept_atom(atom)

        These methods should return 1 if the entity is to be
        written out, 0 otherwise.

        Typically select is a subclass of L{Select}.
        """
        if isinstance(file, str):
            fhandle = open(file, "w")
        else:
            # filehandle, I hope :-)
            fhandle = file

        get_atom_line = self._get_atom_line

        # multiple models?
        if len(self.structure) > 1 or self.use_model_flag:
            model_flag = 1
        else:
            model_flag = 0

        for model in self.structure.get_list():
            if not select.accept_model(model):
                continue
            # necessary for ENDMDL
            # do not write ENDMDL if no residues were written
            # for this model
            model_residues_written = 0
            if not preserve_atom_numbering:
                atom_number = 1
            if model_flag:
                fhandle.write(f"MODEL      {model.serial_num}\n")

            for chain in model.get_list():
                if not select.accept_chain(chain):
                    continue
                chain_id = chain.id
                if len(chain_id) > 1:
                    e = f"Chain id ('{chain_id}') exceeds PDB format limit."
                    raise PDBIOException(e)

                # necessary for TER
                # do not write TER if no residues were written
                # for this chain
                chain_residues_written = 0

                for residue in chain.get_unpacked_list():
                    if not select.accept_residue(residue):
                        continue
                    hetfield, resseq, icode = residue.id
                    resname = residue.resname
                    segid = residue.segid
                    resid = residue.id[1]
                    if resid > 9999:
                        e = f"Residue number ('{resid}') exceeds PDB format limit."
                        raise PDBIOException(e)

                    for atom in residue.get_unpacked_list():
                        if not select.accept_atom(atom):
                            continue
                        chain_residues_written = 1
                        model_residues_written = 1
                        if preserve_atom_numbering:
                            atom_number = atom.serial_number

                        try:
                            s = get_atom_line(
                                atom,
                                hetfield,
                                segid,
                                atom_number,
                                resname,
                                resseq,
                                icode,
                                chain_id,
                            )
                        except Exception as err:
                            # catch and re-raise with more information
                            raise PDBIOException(
                                f"Error when writing atom {atom.full_id}"
                            ) from err
                        else:
                            fhandle.write(s)
                            # inconsequential if preserve_atom_numbering is True
                            atom_number += 1

                # if chain_residues_written:
                #     fhandle.write(
                #         _TER_FORMAT_STRING
                #         % (atom_number, resname, chain_id, resseq, icode)
                #     )

            if model_flag and model_residues_written:
                fhandle.write("ENDMDL\n")
        if write_end:
            fhandle.write("END   \n")

        if isinstance(file, str):
            fhandle.close()


def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

pdb_file = "input.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(output_pdb , write_end = False ,, preserve_atom_numbering = True)

will give :

ATOM     19  HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H  
ATOM     20  C   TYR     1      29.796  62.354  41.246  1.00  0.00           C  
ATOM     23  H   SER     2      36.111  61.950  39.410  1.00  0.00           H  
ATOM     24  CA  SER     2      35.582  64.035  39.354  1.00  0.00           C  

because I monkey patched PDBIO class PDBIO(StructureIO) deleting :

# if chain_residues_written:
#     fhandle.write(
#         _TER_FORMAT_STRING
#         % (atom_number, resname, chain_id, resseq, icode)
#     )

need to import : from Bio.PDB.PDBExceptions import PDBIOException though

A side note:

your original code is incorrect missing : preserve_atom_numbering = True

in pdb_io.save(output_pdb) at least for last stable Biopython