# Source code for fragment_elaboration_scripts.zinc_data

"""
A very simple script to get info from Zinc.
Zinc, as far as I can tell, does not have an API,
so the HTML has to be scraped.

NB. I have not asked for permission to query ZINC programmatically

See ``ZincInformer`` for the class.

-------------
CLI Usage
-------------

.. code-block:: bash

    $ python -m fragment_elaboration_scripts.zinc_data ZINC00000001 ZINC00000002 ZINC00000003 > zinc.json

or

.. code-block:: bash

    $ python -m fragment_elaboration_scripts.zinc_data ZINC00000001 ZINC00000002 ZINC00000003 -o zinc.csv

-------------
Installation
-------------

No special installation requirements beyond ``pip install fragment_elaboration_scripts``
"""

import requests, collections, contextlib, json
from bs4 import BeautifulSoup



# [docs]
class ZincInformer(collections.abc.MutableMapping):
    """
    A simple class to get Zinc info.

    A class that stores the retrieved values in ``.data``
    (use ``.dump`` and ``.load`` to persist them to / restore them from a JSON file).
    The values can be accessed as a subscript or by calling the instance,
    the latter captures errors declared during initialisation by the argument ``suppressed_exception``.

    The instance is callable, so can be used in a ``pandas.Series.apply``:

    .. code-block:: python

        zinfo: Callable = ZincInformer()
        data: pd.DataFrame = series.apply(zinfo)

    The data is stored in a dictionary, so can be dumped to a JSON file and loaded back in.
    This is useful for caching without repeating requests.

    .. code-block:: python

        zinfo: Callable = ZincInformer()
        zinfo.load('zinc.json')
        data: Dict = zinfo['ZINC00000001']
        print(zinfo.data)
        zinfo.dump('zinc.json')

    The data is fetched via a call by ``get_soup`` and then parsed by ``get_zinc_info``,
    which calls ``get_dl`` and ``polísh``.

    Membership tests (``zinc_id in zinfo``) only look at the cached data and never
    trigger a request; subscripting (``zinfo[zinc_id]``) fetches on a cache miss.
    """

    # Seconds to wait for the ZINC server before ``requests`` raises a timeout.
    # Without a timeout ``requests.get`` may block indefinitely.
    request_timeout = 30.0

    def __init__(self, suppressed_exception=Exception):
        """
        :param suppressed_exception: exception class (or tuple of classes) that is
            silenced when the instance is *called*; subscript access always raises.
        """
        self.data = {}
        self.suppressed_exception = suppressed_exception

    def __getitem__(self, zinc_id):
        """Return the info of ``zinc_id``, fetching and caching it on a cache miss."""
        if zinc_id not in self.data:
            soup = self.get_soup(zinc_id)
            self.data[zinc_id] = self.get_zinc_info(zinc_id, soup)
        return self.data[zinc_id]

    def __call__(self, zinc_id):
        """
        Like subscripting, but any ``suppressed_exception`` raised while fetching or
        parsing results in an empty dictionary, which is also cached for that ID.
        """
        with contextlib.suppress(self.suppressed_exception):
            return self[zinc_id]
        # Only reached when the lookup above raised a suppressed exception:
        # remember the failure so the same ID is not requested again.
        self[zinc_id] = {}
        return {}

    def __setitem__(self, zinc_id: str, info: dict):
        self.data[zinc_id] = info

    def __delitem__(self, zinc_id):
        del self.data[zinc_id]

    def __contains__(self, zinc_id) -> bool:
        """
        Tell whether ``zinc_id`` is already cached.

        The mixin inherited from ``Mapping`` implements ``in`` by calling
        ``__getitem__``, which here would fire a web request for every unknown ID.
        """
        return zinc_id in self.data

    def __iter__(self):
        return iter(self.data)

    def __len__(self):
        return len(self.data)

    def dump(self, filename: str = 'zinc.json'):
        """Write the cached data to ``filename`` as JSON."""
        with open(filename, 'w', encoding='utf-8') as fh:
            json.dump(self.data, fh)

    def load(self, filename: str = 'zinc.json'):
        """Replace the cached data with the JSON content of ``filename``."""
        with open(filename, 'r', encoding='utf-8') as fh:
            self.data = json.load(fh)

    # ======== specific methods

    @classmethod
    def get_soup(cls, zinc_id: str) -> "BeautifulSoup":
        """
        Download the substance page of ``zinc_id`` and parse the HTML. Return a soup.

        Raises whatever ``requests`` raises on a connection problem, a timeout
        (see ``request_timeout``) or a non-2xx status code.
        """
        response = requests.get(f'https://zinc.docking.org/substances/{zinc_id}',
                                timeout=cls.request_timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    @classmethod
    def get_dl(cls, soup: "BeautifulSoup") -> dict:
        """
        Data tables are organised in dl entries, dt headers and dd fields.

        Headers and fields are paired within each ``dl`` separately, so a list with
        an uneven number of ``dt`` and ``dd`` cannot shift the pairs of later lists.
        """
        pairs = {}
        for dl in soup.find_all("dl"):
            for dt, dd in zip(dl.find_all("dt"), dl.find_all("dd")):
                pairs[dt.text.strip()] = dd.text.strip()
        return pairs

    @classmethod
    def polísh(cls, dl: dict) -> dict:
        """
        There's a bunch of words that get in the way...

        Returns a copy of ``dl`` whose keys have these words removed and are stripped.
        """
        cleaned = {}
        for key, value in dl.items():
            for noise in ('Bioactive', 'Natural Products', 'Building Blocks'):
                key = key.replace(noise, '')
            cleaned[key.strip()] = value
        return cleaned

    @classmethod
    def get_zinc_info(cls, zinc_id, soup):
        """
        Collect the page title, the three structure fields and the ``dl`` tables of a soup.

        These fields ought to always exist!
        If one does not, the lookup fails with an ``AttributeError``
        (``find`` returns ``None``), which ``__call__`` may suppress.
        """
        return {'query_name': zinc_id,
                'title': soup.title.text.strip(),
                'SMILES': soup.find('input', dict(id="substance-smiles-field")).attrs['value'].strip(),
                'inchi': soup.find('input', dict(id="substance-inchi-field")).attrs['value'].strip(),
                'inchikey': soup.find('input', dict(id="substance-inchikey-field")).attrs['value'].strip(),
                **cls.polísh(cls.get_dl(soup))
                }




# ----------------- CLI ---------------------------------------------------------------------------------------------


# [docs]
def main(argv=None):
    """
    Command-line entry point: look up the given Zinc IDs and output the result.

    Without ``--output`` the collected data is printed to stdout as JSON
    (so it can be redirected into a ``.json`` file); with ``--output`` the
    requested entries are written to a CSV file, which requires pandas.
    With ``--cache`` previous results are read from, and all results written
    back to, the given JSON file.

    :param argv: argument list to parse; ``None`` (default) uses ``sys.argv[1:]``.
    """
    import argparse
    import os
    from typing import List, Dict

    parser = argparse.ArgumentParser(description='Get info from Zinc.')
    parser.add_argument('zinc_ids', nargs='+', help='Zinc IDs')
    parser.add_argument('-o', '--output', help='Output CSV file', default='')
    parser.add_argument('-c', '--cache', help='Saved cache file', default='')
    args = parser.parse_args(argv)
    zinformer = ZincInformer()
    # On the first run the cache file does not exist yet: start empty, it is written below
    if args.cache and os.path.isfile(args.cache):
        zinformer.load(args.cache)
    # Calling the informer fills its cache as a side effect, so do it before dumping
    details: List[Dict] = [{'id': zinc_id, **zinformer(zinc_id)} for zinc_id in args.zinc_ids]
    if args.cache:
        zinformer.dump(args.cache)
    if args.output:
        # pandas is only needed for the CSV output
        import pandas as pd
        pd.DataFrame(details).to_csv(args.output, index=False)
    else:
        # Emit real JSON, not the repr of the dictionary
        print(json.dumps(zinformer.data))


# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# Source code for fragment_elaboration_scripts.zinc_data
#
# fragment_elaboration_scripts
#
# Navigation
#
# Related Topics