# Source code for fragment_elaboration_scripts.zinc_data
"""
A very simple script to get info from Zinc.
Zinc, as far as I can tell, does not have an API,
so the HTML has to be scraped.
NB. I have not asked for permission to query ZINC programmatically
See ``ZincInformer`` for the class.
-------------
CLI Usage
-------------
.. code-block:: bash
$ python -m zinc_data ZINC00000001 ZINC00000002 ZINC00000003 > zinc.json
or
.. code-block:: bash
$ python -m zinc_data ZINC00000001 ZINC00000002 ZINC00000003 -o zinc.csv
-------------
Installation
-------------
No special installation requirements beyond ``pip install fragment_elaboration_scripts``
"""
import collections
import collections.abc
import contextlib
import json

import requests
from bs4 import BeautifulSoup
[docs]
class ZincInformer(collections.abc.MutableMapping):
    """
    A simple class to get Zinc info.

    The retrieved values are stored in ``.data``, a dictionary keyed by Zinc ID
    (``.dump`` and ``.load`` write it to and read it from a JSON file).
    The values can be accessed as a subscript or by calling the instance:

    * subscripting (``zinfo['ZINC00000001']``) fetches the entry if it is not
      stored yet and lets any error propagate;
    * calling (``zinfo('ZINC00000001')``) does the same, but suppresses the
      exception declared during initialisation by the argument
      ``suppressed_exception`` and stores an empty dictionary for that ID instead.

    Membership tests (``zinc_id in zinfo``) only look at the stored data
    and never trigger a request.

    The instance is callable, so can be used in a ``pandas.Series.apply``:

    .. code-block:: python

        zinfo: Callable = ZincInformer()
        data = series.apply(zinfo)

    The data is stored in a dictionary, so can be dumped to a JSON file and loaded back in.
    This is useful for caching without repeating requests.

    .. code-block:: python

        zinfo: Callable = ZincInformer()
        zinfo.load('zinc.json')
        data: Dict = zinfo['ZINC00000001']
        print(zinfo.data)
        zinfo.dump('zinc.json')

    The data is fetched via a call by ``get_soup`` and then parsed by ``get_zinc_info``,
    which calls ``get_dl`` and ``polísh``.
    """

    # words removed from the ``dt`` headers by ``polísh``, in this order
    _label_noise = ('Bioactive', 'Natural Products', 'Building Blocks')

    def __init__(self, suppressed_exception=Exception):
        """
        :param suppressed_exception: exception class silenced (via ``contextlib.suppress``)
            when the instance is called; subscript access is never silenced.
        """
        self.data = {}
        self.suppressed_exception = suppressed_exception

    def __getitem__(self, zinc_id):
        """Return the stored info for ``zinc_id``, fetching and parsing its page on a miss."""
        if zinc_id not in self.data:
            soup = self.get_soup(zinc_id)
            self.data[zinc_id] = self.get_zinc_info(zinc_id, soup)
        return self.data[zinc_id]

    def __contains__(self, zinc_id):
        """
        Tell whether ``zinc_id`` is already stored.

        Defined explicitly because the inherited ``Mapping.__contains__`` calls
        ``__getitem__``, which would fire a web request for every unknown ID.
        """
        return zinc_id in self.data

    def __call__(self, zinc_id):
        """
        Error-tolerant lookup: like ``self[zinc_id]``, but a suppressed failure
        is recorded as an empty dictionary, which is also what gets returned.
        """
        with contextlib.suppress(self.suppressed_exception):
            return self[zinc_id]
        # only reached when the lookup raised a suppressed exception
        self[zinc_id] = {}
        return {}

    def __setitem__(self, zinc_id: str, info: dict):
        self.data[zinc_id] = info

    def __delitem__(self, zinc_id):
        del self.data[zinc_id]

    def __iter__(self):
        return iter(self.data)

    def __len__(self):
        return len(self.data)

    def dump(self, filename: str = 'zinc.json'):
        """Write ``.data`` to ``filename`` as JSON."""
        with open(filename, 'w', encoding='utf-8') as fh:
            json.dump(self.data, fh)

    def load(self, filename: str = 'zinc.json'):
        """Replace ``.data`` with the JSON content of ``filename``."""
        with open(filename, 'r', encoding='utf-8') as fh:
            self.data = json.load(fh)

    # ======== specific methods

    @classmethod
    def get_soup(cls, zinc_id: str, timeout: float = 30.0) -> 'BeautifulSoup':
        """
        Download the substance page of ``zinc_id`` and parse the HTML. Return a soup.

        :param zinc_id: Zinc ID, inserted verbatim in the URL.
        :param timeout: seconds handed to ``requests.get``, so that an
            unresponsive server cannot block the caller forever.
        :raises requests.HTTPError: on an error status (via ``raise_for_status``).
        """
        response = requests.get(f'https://zinc.docking.org/substances/{zinc_id}', timeout=timeout)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    @classmethod
    def get_dl(cls, soup: 'BeautifulSoup') -> dict:
        """
        Data tables are organised in dl entries, dt headers and dd fields.

        Headers and fields are paired within each ``dl`` separately, so a list
        with unequal numbers of ``dt`` and ``dd`` cannot shift the pairing of
        the lists that follow it. Texts are stripped; a repeated header keeps
        the last field seen.
        """
        pairs = {}
        for dl in soup.find_all('dl'):
            headers = [dt.text.strip() for dt in dl.find_all('dt')]
            fields = [dd.text.strip() for dd in dl.find_all('dd')]
            pairs.update(zip(headers, fields))
        return pairs

    @classmethod
    def polísh(cls, dl: dict) -> dict:
        """
        There's a bunch of words that get in the way...

        Return a copy of ``dl`` whose keys have the words 'Bioactive',
        'Natural Products' and 'Building Blocks' removed and are stripped.
        """
        cleaned = {}
        for key, value in dl.items():
            for word in cls._label_noise:
                key = key.replace(word, '')
            cleaned[key.strip()] = value
        return cleaned

    @staticmethod
    def _input_value(soup, element_id: str) -> str:
        """Return the stripped ``value`` attribute of the ``input`` element with the given id."""
        return soup.find('input', {'id': element_id}).attrs['value'].strip()

    @classmethod
    def get_zinc_info(cls, zinc_id, soup):
        """
        Assemble the info dictionary of one substance page.

        These fields ought to always exist! (title and the SMILES, InChI and
        InChIKey inputs): no fallback is provided for them.
        The polished ``dl`` tables are merged in last,
        so they win over the fixed keys on a name clash.
        """
        return {'query_name': zinc_id,
                'title': soup.title.text.strip(),
                'SMILES': cls._input_value(soup, 'substance-smiles-field'),
                'inchi': cls._input_value(soup, 'substance-inchi-field'),
                'inchikey': cls._input_value(soup, 'substance-inchikey-field'),
                **cls.polísh(cls.get_dl(soup))
                }
# ----------------- CLI ---------------------------------------------------------------------------------------------
[docs]
def main(argv=None):
    """
    Command line entry point: look up Zinc IDs and export what was found.

    Every ID is looked up by calling a ``ZincInformer`` instance, so an ID whose
    lookup raised ends up as an empty entry rather than aborting the run.

    * ``-o/--output``: write one row per requested ID to this CSV file (needs pandas);
      without it the stored data is printed to stdout as JSON.
    * ``-c/--cache``: JSON file loaded before the lookups (if it exists)
      and rewritten afterwards; it is created on the first run.

    :param argv: argument list to parse; ``None`` (default) means ``sys.argv[1:]``.
    """
    import argparse
    import contextlib
    import json

    parser = argparse.ArgumentParser(description='Get info from Zinc.')
    parser.add_argument('zinc_ids', nargs='+', help='Zinc IDs')
    parser.add_argument('-o', '--output', help='Output CSV file', default='')
    parser.add_argument('-c', '--cache', help='Saved cache file', default='')
    args = parser.parse_args(argv)
    zinformer = ZincInformer()
    if args.cache:
        # on the first run the cache file does not exist yet: start empty,
        # the dump below creates it
        with contextlib.suppress(FileNotFoundError):
            zinformer.load(args.cache)
    details = [{'id': zinc_id, **zinformer(zinc_id)} for zinc_id in args.zinc_ids]
    if args.cache:
        zinformer.dump(args.cache)
    if args.output:
        # pandas is only required for the CSV export
        import pandas as pd

        pd.DataFrame(details).to_csv(args.output, index=False)
    else:
        # real JSON (not a Python dict repr), so that `> zinc.json` yields a loadable file
        print(json.dumps(zinformer.data, indent=2))
# Run the command line interface only when this file is executed as a script
# (or via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()