TGA
pydatalab.apps.tga
blocks
MassSpecBlock (DataBlock)
Source code in pydatalab/apps/tga/blocks.py
class MassSpecBlock(DataBlock):
    blocktype = "ms"
    name = "Mass spectrometry"
    description = "Read and visualize mass spectrometry data as a grid plot per channel"

    accepted_file_extensions = (".asc", ".txt")

    @property
    def plot_functions(self):
        return (self.generate_ms_plot,)

    def generate_ms_plot(self):
        file_info = None
        # all_files = None
        ms_data = None

        if "file_id" not in self.data:
            LOGGER.warning("No file set in the DataBlock")
            return
        else:
            file_info = get_file_info_by_id(self.data["file_id"], update_if_live=True)
            ext = os.path.splitext(file_info["location"].split("/")[-1])[-1].lower()
            if ext not in self.accepted_file_extensions:
                LOGGER.warning(
                    "Unsupported file extension (must be one of %s, not %s)",
                    self.accepted_file_extensions,
                    ext,
                )
                return

            ms_data = parse_mt_mass_spec_ascii(Path(file_info["location"]))

        if ms_data:
            self.data["bokeh_plot_data"] = self._plot_ms_data(ms_data)

    @classmethod
    def _plot_ms_data(cls, ms_data):
        x_options = ["Time Relative [s]"]

        # collect the maximum value of the data key for each species for plot ordering
        max_vals: List[Tuple[str, float]] = []

        data_key: str = (
            "Partial pressure [mbar] or Ion Current [A]"  # default value for data key if missing
        )

        for species in ms_data["data"]:
            data_key = (
                "Partial Pressure [mbar]"
                if "Partial Pressure [mbar]" in ms_data["data"][species]
                else "Ion Current [A]"
            )
            data = ms_data["data"][species][data_key].to_numpy()
            ms_data["data"][species][f"{data_key} (Savitzky-Golay)"] = savgol_filter(
                data, len(data) // 10, 3
            )
            max_vals.append((species, ms_data["data"][species][data_key].max()))

        plots = []
        for ind, (species, _) in enumerate(sorted(max_vals, key=lambda x: x[1], reverse=True)):
            plots.append(
                selectable_axes_plot(
                    {species: ms_data["data"][species]},
                    x_options=x_options,
                    y_options=[data_key],
                    y_default=[
                        f"{data_key} (Savitzky-Golay)",
                        f"{data_key}",
                    ],
                    label_x=(ind == 0),
                    label_y=(ind == 0),
                    plot_line=True,
                    plot_points=False,
                    plot_title=f"Channel name: {species}",
                    plot_index=ind,
                    aspect_ratio=1.5,
                )
            )

            plots[-1].children[0].xaxis[0].ticker.desired_num_ticks = 2

        # construct MxN grid of all species
        M = 3
        grid = []
        for i in range(0, len(plots), M):
            grid.append(plots[i : i + M])

        p = gridplot(grid, sizing_mode="scale_width", toolbar_location="below")

        return bokeh.embed.json_item(p, theme=DATALAB_BOKEH_GRID_THEME)
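The smoothing step above uses scipy's savgol_filter with a window of roughly one tenth of the trace length and a cubic polynomial. A minimal standalone sketch of the same call pattern (the synthetic trace is invented for illustration):

import numpy as np
from scipy.signal import savgol_filter

# Synthetic ion-current trace: exponential decay plus noise (illustrative only)
rng = np.random.default_rng(0)
t = np.linspace(0, 600, 600)
trace = np.exp(-t / 200) + 0.05 * rng.standard_normal(t.size)

# Same pattern as _plot_ms_data: window ~len(data) // 10, polyorder 3.
# The window is forced odd here, which older SciPy versions require;
# the block itself passes len(data) // 10 directly.
window = max(len(trace) // 10, 5) | 1
smoothed = savgol_filter(trace, window, 3)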
accepted_file_extensions: tuple[str, ...] | None
A list of file extensions that the block will attempt to read.

blocktype: str
A short (unique) string key specifying the type of block.

description: str
A longer description outlining the purpose and capability of the block.

name: str
The human-readable block name specifying which technique or file format it pertains to.

plot_functions (read-only property)
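A tuple of bound methods that generate the block's plots; here it contains only generate_ms_plot. A minimal sketch of the convention (the dispatch loop is an assumption for illustration, not datalab's actual refresh code):

# Hypothetical standalone illustration of the plot_functions convention:
# a property exposing a tuple of callables that populate self.data.
class ExampleBlock:
    def __init__(self):
        self.data = {}

    @property
    def plot_functions(self):
        return (self.generate_plot,)

    def generate_plot(self):
        self.data["bokeh_plot_data"] = {"placeholder": True}

block = ExampleBlock()
for plot_fn in block.plot_functions:  # assumed caller-side loop
    plot_fn()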
generate_ms_plot(self)
Source code in pydatalab/apps/tga/blocks.py
def generate_ms_plot(self):
    file_info = None
    # all_files = None
    ms_data = None

    if "file_id" not in self.data:
        LOGGER.warning("No file set in the DataBlock")
        return
    else:
        file_info = get_file_info_by_id(self.data["file_id"], update_if_live=True)
        ext = os.path.splitext(file_info["location"].split("/")[-1])[-1].lower()
        if ext not in self.accepted_file_extensions:
            LOGGER.warning(
                "Unsupported file extension (must be one of %s, not %s)",
                self.accepted_file_extensions,
                ext,
            )
            return

        ms_data = parse_mt_mass_spec_ascii(Path(file_info["location"]))

    if ms_data:
        self.data["bokeh_plot_data"] = self._plot_ms_data(ms_data)
parsers
parse_mt_mass_spec_ascii(path: Path) -> Dict[str, Union[pandas.core.frame.DataFrame, Dict]]

Parses an .asc file containing MS results from a Mettler-Toledo spectrometer and returns a dictionary with keys `data` and `meta`, which themselves contain a dictionary of dataframes for each species (with the species names/masses as keys) and a dictionary of metadata fields, respectively.

Parameters:

Name | Type | Description | Default
---|---|---|---
path | Path | The path of the file to parse. | required
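A minimal usage sketch (the filename is hypothetical):

from pathlib import Path

from pydatalab.apps.tga.parsers import parse_mt_mass_spec_ascii

ms = parse_mt_mass_spec_ascii(Path("example_export.asc"))

print(ms["meta"]["Start Time"])  # header timestamps are parsed to datetimes
for species, frame in ms["data"].items():
    # one dataframe per channel, keyed by species name/mass
    print(species, list(frame.columns), len(frame))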
Source code in pydatalab/apps/tga/parsers.py
def parse_mt_mass_spec_ascii(path: Path) -> Dict[str, Union[pd.DataFrame, Dict]]:
    """Parses an .asc file containing MS results from a Mettler-Toledo
    spectrometer and returns a dictionary with keys `data` and `meta`,
    which themselves contain a dictionary of dataframes for each species
    with the species names/masses as keys, and a dictionary of
    metadata fields respectively.

    Parameters:
        path: The path of the file to parse.

    """
    header_keys = ("Sourcefile", "Exporttime", "Start Time", "End Time")
    data_keys = ("Time Relative [s]", "Partial Pressure [mbar]", "Ion Current [A]")
    header = {}
    species = []

    if not path.exists():
        raise RuntimeError(f"Provided path does not exist: {path!r}")

    with open(path) as f:
        # Read start of file until all header keys have been found
        max_header_lines = 8
        reads = 0
        header_end = None
        while reads < max_header_lines:
            line = f.readline().strip()
            reads += 1
            if line:
                for key in header_keys:
                    if key in line:
                        header[key] = line.split(key)[-1].strip()
            if all(k in header for k in header_keys):
                header_end = f.tell()
                break
        else:
            raise ValueError(
                f"Could not find all header keys in first {max_header_lines} lines of file."
            )

        for key in header_keys[1:]:
            if "time" in key.lower():
                header[key] = dateutil.parser.parse(header[key])  # type: ignore

        reads = 0
        max_species_lines = 10
        while reads < max_species_lines:
            line = f.readline().strip()
            reads += 1
            if not line:
                continue
            species = line.split()
            break
        else:
            raise ValueError(
                f"Could not find species list in lines {header_end}:{header_end + max_species_lines} lines of file."
            )

        # Read data with duplicated keys: will have (column number % number of data keys) appended to them
        # MT software also writes "---" if the value is missing, so parse these as NaNs to remove later
        df = pd.read_csv(f, sep="\t", header=0, parse_dates=False, na_values=["---"])

    ms_results: Dict[str, Union[pd.DataFrame, Dict]] = {}
    ms_results["meta"] = header
    ms_results["data"] = {}

    # Some files have Ion Current [A] or Partial Pressure [mbar] -- only rename those that are present
    present_keys = set(df.columns.values) & set(data_keys)
    for ind, specie in enumerate(species):
        # Loop over all species and rename the columns to remove the species name and disaggregate as a dict
        species_data_keys = [k + f"{'.' + str(ind) if ind != 0 else ''}" for k in present_keys]
        ms_results["data"][specie] = df[species_data_keys].rename(
            {mangled: original for mangled, original in zip(species_data_keys, present_keys)},
            axis="columns",
        )

        # Drop time axis as format cannot be easily inferred and data is essentially duplicated:
        # "Start Time" in the header provides the timestamp of the first row
        ms_results["data"][specie].drop("Time", axis="columns", inplace=True, errors="ignore")

        # If the file was provided in an incomplete form, the final rows will be NaN, so drop them
        ms_results["data"][specie].dropna(inplace=True)

    return ms_results
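The column disaggregation above relies on pandas appending ".1", ".2", and so on to duplicated column names when reading the header, which is exactly what the species_data_keys list reconstructs before renaming. A standalone illustration of that behaviour:

import io

import pandas as pd

# Two channels sharing the same column headers, as in the MT export
raw = (
    "Time Relative [s]\tIon Current [A]\tTime Relative [s]\tIon Current [A]\n"
    "0.0\t1e-9\t0.0\t2e-9\n"
)
df = pd.read_csv(io.StringIO(raw), sep="\t")
print(list(df.columns))
# ['Time Relative [s]', 'Ion Current [A]', 'Time Relative [s].1', 'Ion Current [A].1']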