pycaravel: This module is designed to simplify continuous integration (CI) of data from multiple projects.
Source code for caravel.parsers.parser_base
##########################################################################
# NSAp - Copyright (C) CEA, 2019
# Distributed under the terms of the CeCILL-B license, as published by
# the CEA-CNRS-INRIA. Refer to the LICENSE file or to
# http://www.cecill.info/licences/Licence_CeCILL-B_V1-en.html
# for details.
##########################################################################
"""
This module contains the generic parser definition.
"""
# System import
import datetime
import glob
import json
import os
import pickle
# Third party import
import pandas as pd
# Package import
from caravel.io import load
[docs]
class ParserBase:
""" Base parser to retrieve data from a BIDS directory.
"""
AVAILABLE_LAYOUTS = ("sourcedata", "rawdata", "derivatives", "phenotype")
def __init__(self, project, confdir, layoutdir):
""" Initialize the Caravel class.
Parameters
----------
project: str
the name of the project you are working on.
confdir: str
the locations of the configuration file of the current project.
layoutdir: str
the location of the pre-generated parsing representations. If None
switch to managers mode.
"""
self.project = project
self.layouts = {}
_conf = ParserBase._get_conf(confdir)
if project not in _conf:
raise ValueError(
f"Unknown configuration for project '{project}'. Available projects "
f"are: {_conf.keys()}.")
self.conf = _conf[project]
if layoutdir is not None:
_repr = self._get_repr(layoutdir)
if project not in _repr:
raise ValueError(
f"Unknown representation for project '{project}'. "
f"Available projects are: {_repr.keys()}.")
self.representation = _repr[project]
else:
self.representation = {"manager": [{"path": "to_be_created.pkl"}]}
self.connection = None
[docs]
def can_load(self):
""" A method checking the dataset type.
Returns
-------
out: bool
True if the dataset can be loaded, False otherwise.
"""
checks = [elem[-1]["path"] for elem in self.representation.values()]
if len(checks) == 0:
return False
return all(elem.endswith(self.EXT) for elem in checks)
def _check_layout(self, name):
""" Check if the layout name is supported.
"""
if name not in self.AVAILABLE_LAYOUTS:
raise ValueError(
f"Layout '{name}' is not yet supported. "
f"Available layouts are: {self.AVAILABLE_LAYOUTS}.")
@classmethod
def _get_conf(cls, confdir):
""" List all the configurations available and sort them by project.
"""
conf = {}
for path in glob.glob(os.path.join(confdir, "*.conf")):
basename = os.path.basename(path).replace(".conf", "")
project, name = basename.split("_")
if project not in conf:
conf[project] = {}
conf[project][name] = path
return conf
def _get_repr(self, layoutdir):
""" List all the layout representation available and sort them by
dates.
"""
representations = {}
layout_files = glob.glob(os.path.join(layoutdir, "*.pkl"))
for path in layout_files:
basename = os.path.basename(path).replace(".pkl", "")
project, name, timestamp = basename.split("_")
if project not in representations:
representations[project] = {}
representations[project].setdefault(name, []).append(
{"date": timestamp, "path": path})
for project_data in representations.values():
for name_data in project_data.values():
name_data.sort(key=lambda x: datetime.datetime.strptime(
x["date"], "%Y-%m-%d"))
return representations
def _check_conf(self, name):
""" Check if configuration is declared for the layout.
"""
if name not in self.conf:
raise ValueError(
"No configuration available for layout '{0}'. Please contact "
"the module developers to add the support for your project.")
def _load_layout(self, name):
""" Load a layout from its pre-generated representation.
"""
if name not in self.layouts:
if name not in self.representation:
raise ValueError(
f"A pre-generated '{name}' layout for your project "
f"'{self.project}' is expected in user mode. Please "
"contact the developers of the module.")
path = self.representation[name][-1]["path"]
with open(path, "rb") as open_file:
self.layouts[name] = pickle.load(open_file)
return self.layouts[name]
def _load_conf(self, name):
""" Load the configuration associated to a layout.
"""
if not isinstance(self.conf[name], dict):
with open(self.conf[name], "rt") as open_file:
self.conf[name] = json.load(open_file)
[docs]
def export_layout(self, name):
""" Export a layout as a pandas DataFrame.
Parameters
----------
name: str
the name of the layout.
Returns
-------
df: pandas DataFrame
the converted layout.
"""
raise NotImplementedError("This function has to be defined in child "
"child class.")
[docs]
def list_keys(self, name):
""" List all the filtering keys available in the layout.
Parameters
----------
name: str
the name of the layout.
Returns
-------
keys: list
the layout keys.
"""
raise NotImplementedError("This function has to be defined in child "
"child class.")
[docs]
def list_values(self, name, key):
""" List all the filtering key values available in the layout.
Parameters
----------
name: str
the name of the layout.
key: str
the name of key in the layout.
Returns
-------
values: list
the key associated values in the layout.
"""
raise NotImplementedError("This function has to be defined in child "
"child class.")
[docs]
def filter_layout(self, name, extensions=None, **kwargs):
""" Filter the layout by using a combination of key-values rules.
Parameters
----------
name: str
the name of the layout.
extensions: str or list of str
a filtering rule on the file extension.
kwargs: dict
the filtering options.
Returns
-------
df: pandas DataFrame
the filtered layout.
"""
raise NotImplementedError("This function has to be defined in child "
"child class.")
[docs]
def load_data(self, name, df, replace=None):
""" Load the data contained in the filename column of a pandas
DataFrame.
Note:
Only a couple of file extensions are supported. If no loader has been
found None is returned.
Parameters
----------
name: str
the name of the layout.
df: pandas DataFrame
a table with one 'filename' column.
replace: 2-uplet, default None
in the case of a CubicWeb resource, the data are downloaded in a
custom folder. Use this parameter to replace the server location
by your own location.
Returns
-------
data: dict
a dictionaray containing the loaded data.
"""
if "filename" not in df:
raise ValueError("One 'filename' column expected in your table.")
data = {}
for index, path in enumerate(df["filename"]):
if isinstance(path, dict):
_data = pd.DataFrame.from_records([path])
path = [f"{key}-{val}"
for key, val in zip(df.columns, df.to_numpy()[index])
if key != "filename"]
path = "_".join(path)
else:
if replace is not None:
path = path.replace(replace[0], replace[1])
try:
_data = load(path)
except Exception:
_data = None
if isinstance(_data, pd.DataFrame):
layout = self._load_layout(name)
file_obj = layout.files[path]
for ent_name, ent_val in file_obj.entities.items():
if ent_name in self.BASE_ENTITIES:
_data[ent_name] = ent_val
_data["dtype"] = name
if "participant_id" in _data:
_data["participant_id"] = _data[
"participant_id"].str.replace("sub-", "")
data[path] = _data
return data
Follow us