Skip to content

TGA

pydatalab.apps.tga special

blocks

MassSpecBlock (DataBlock)

Source code in pydatalab/apps/tga/blocks.py
class MassSpecBlock(DataBlock):
    """Block for reading and visualizing mass-spectrometry data as one
    bokeh plot per channel, arranged in a grid."""

    blocktype = "ms"
    name = "Mass spectrometry"
    description = "Read and visualize mass spectrometry data as a grid plot per channel"
    accepted_file_extensions = (".asc", ".txt")

    @property
    def plot_functions(self):
        return (self.generate_ms_plot,)

    def generate_ms_plot(self):
        """Parse the file attached to this block and cache the serialized
        bokeh plot under ``self.data["bokeh_plot_data"]``.

        Logs a warning and returns early when no file is attached or the
        extension is not supported.
        """
        if "file_id" not in self.data:
            LOGGER.warning("No file set in the DataBlock")
            return

        file_info = get_file_info_by_id(self.data["file_id"], update_if_live=True)
        ext = os.path.splitext(file_info["location"].split("/")[-1])[-1].lower()
        if ext not in self.accepted_file_extensions:
            LOGGER.warning(
                "Unsupported file extension (must be one of %s, not %s)",
                self.accepted_file_extensions,
                ext,
            )
            return

        ms_data = parse_mt_mass_spec_ascii(Path(file_info["location"]))
        if ms_data:
            self.data["bokeh_plot_data"] = self._plot_ms_data(ms_data)

    @classmethod
    def _plot_ms_data(cls, ms_data):
        """Build one plot per species (sorted by maximum signal), lay them out
        in a 3-wide grid, and return the serialized bokeh JSON for the grid.

        Parameters:
            ms_data: the ``{"data": ..., "meta": ...}`` dict produced by
                ``parse_mt_mass_spec_ascii``.
        """
        x_options = ["Time Relative [s]"]

        # collect the maximum value of the data key for each species for plot ordering
        max_vals: List[Tuple[str, float]] = []

        data_key: str = (
            "Partial pressure [mbar] or Ion Current [A]"  # default value for data key if missing
        )

        for species in ms_data["data"]:
            # Files contain either partial pressures or ion currents per channel
            data_key = (
                "Partial Pressure [mbar]"
                if "Partial Pressure [mbar]" in ms_data["data"][species]
                else "Ion Current [A]"
            )
            data = ms_data["data"][species][data_key].to_numpy()

            # Smooth with a window of ~10% of the trace length, floored at 5 so
            # that short traces do not crash savgol_filter (the window must
            # exceed the polynomial order of 3).
            window = max(5, len(data) // 10)
            ms_data["data"][species][f"{data_key} (Savitzky-Golay)"] = savgol_filter(
                data, window, 3
            )

            max_vals.append((species, ms_data["data"][species][data_key].max()))

        plots = []
        for ind, (species, _) in enumerate(sorted(max_vals, key=lambda x: x[1], reverse=True)):
            plots.append(
                selectable_axes_plot(
                    {species: ms_data["data"][species]},
                    x_options=x_options,
                    y_options=[data_key],
                    y_default=[
                        f"{data_key} (Savitzky-Golay)",
                        f"{data_key}",
                    ],
                    label_x=(ind == 0),
                    label_y=(ind == 0),
                    plot_line=True,
                    plot_points=False,
                    plot_title=f"Channel name: {species}",
                    plot_index=ind,
                    aspect_ratio=1.5,
                )
            )

            plots[-1].children[0].xaxis[0].ticker.desired_num_ticks = 2

        # BUG FIX: grid assembly and the return statement were previously
        # inside the loop above, so only the first (largest-signal) channel
        # was ever plotted. Construct the MxN grid of all species here.
        M = 3
        grid = [plots[i : i + M] for i in range(0, len(plots), M)]
        p = gridplot(grid, sizing_mode="scale_width", toolbar_location="below")

        return bokeh.embed.json_item(p, theme=DATALAB_BOKEH_GRID_THEME)
accepted_file_extensions: tuple[str, ...] | None

A tuple of file extensions that the block will attempt to read.

blocktype: str

A short (unique) string key specifying the type of block.

description: str

A longer description outlining the purpose and capability of the block.

name: str

The human-readable block name specifying which technique or file format it pertains to.

plot_functions property readonly
generate_ms_plot(self)
Source code in pydatalab/apps/tga/blocks.py
def generate_ms_plot(self):
    """Parse the block's attached file and, on success, store the serialized
    bokeh plot in ``self.data["bokeh_plot_data"]``.

    Warns and returns early when no file is attached or when the file's
    extension is not one of ``accepted_file_extensions``.
    """
    if "file_id" not in self.data:
        LOGGER.warning("No file set in the DataBlock")
        return

    file_info = get_file_info_by_id(self.data["file_id"], update_if_live=True)
    filename = file_info["location"].split("/")[-1]
    extension = os.path.splitext(filename)[-1].lower()

    if extension not in self.accepted_file_extensions:
        LOGGER.warning(
            "Unsupported file extension (must be one of %s, not %s)",
            self.accepted_file_extensions,
            extension,
        )
        return

    parsed = parse_mt_mass_spec_ascii(Path(file_info["location"]))
    if parsed:
        self.data["bokeh_plot_data"] = self._plot_ms_data(parsed)

parsers

parse_mt_mass_spec_ascii(path: Path) -> Dict[str, Union[pandas.core.frame.DataFrame, Dict]]

Parses an .asc file containing MS results from a Mettler-Toledo spectrometer and returns a dictionary with keys data and meta, which themselves contain a dictionary of dataframes for each species with the species names/masses as keys, and a dictionary of metadata fields respectively.

Parameters:

Name Type Description Default
path Path

The path of the file to parse.

required
Source code in pydatalab/apps/tga/parsers.py
def parse_mt_mass_spec_ascii(path: Path) -> Dict[str, Union[pd.DataFrame, Dict]]:
    """Parses an .asc file containing MS results from a Mettler-Toledo
    spectrometer and returns a dictionary with keys `data` and `meta`,
    which themselves contain a dictionary of dataframes for each species
    with the species names/masses as keys, and a dictionary of
    metadata fields respectively.

    Parameters:
        path: The path of the file to parse.

    Raises:
        RuntimeError: If the provided path does not exist.
        ValueError: If the metadata header or the species list cannot be
            found in the expected region of the file.

    """

    header_keys = ("Sourcefile", "Exporttime", "Start Time", "End Time")
    data_keys = ("Time Relative [s]", "Partial Pressure [mbar]", "Ion Current [A]")
    header = {}
    species = []

    if not path.exists():
        raise RuntimeError(f"Provided path does not exist: {path!r}")

    with open(path) as f:
        # Read start of file until all header keys have been found
        max_header_lines = 8
        reads = 0
        # Byte offset (from f.tell()) of the end of the header block, not a line number
        header_end = None
        while reads < max_header_lines:
            line = f.readline().strip()
            reads += 1
            if line:
                for key in header_keys:
                    if key in line:
                        # Everything after the key name on the line is its value
                        header[key] = line.split(key)[-1].strip()
            if all(k in header for k in header_keys):
                header_end = f.tell()
                break
        else:
            raise ValueError(
                f"Could not find all header keys in first {max_header_lines} lines of file."
            )

        # Timestamps are exported in a locale-dependent format; let dateutil infer it
        for key in header_keys[1:]:
            if "time" in key.lower():
                header[key] = dateutil.parser.parse(header[key])  # type: ignore

        # The species (channel) names appear on the first non-blank line after the header
        reads = 0
        max_species_lines = 10
        while reads < max_species_lines:
            line = f.readline().strip()
            reads += 1
            if not line:
                continue
            species = line.split()
            break
        else:
            # BUG FIX: the previous message interpolated `header_end` (a byte
            # offset from f.tell()) as if it were a pair of line numbers.
            raise ValueError(
                f"Could not find species list in the {max_species_lines} lines following the header (header ends at byte offset {header_end})."
            )

        # Read data with duplicated keys: will have (column number % number of data keys) appended to them
        # MT software also writes "---" if the value is missing, so parse these as NaNs to remove later
        df = pd.read_csv(f, sep="\t", header=0, parse_dates=False, na_values=["---"])
        ms_results: Dict[str, Union[pd.DataFrame, Dict]] = {}
        ms_results["meta"] = header
        ms_results["data"] = {}

        # Some files have Ion Current [A] or Partial Pressure [mbar] -- only rename those that are present
        present_keys = set(df.columns.values) & set(data_keys)
        for ind, specie in enumerate(species):
            # Loop over all species and rename the columns to remove the species name and disaggregate as a dict
            # (pandas mangles duplicate column names to "<name>.<n>" for n > 0)
            species_data_keys = [k + f"{'.' + str(ind) if ind != 0 else ''}" for k in present_keys]
            ms_results["data"][specie] = df[species_data_keys].rename(
                {mangled: original for mangled, original in zip(species_data_keys, present_keys)},
                axis="columns",
            )

            # Drop time axis as format cannot be easily inferred and data is essentially duplicated: "Start Time" in header
            # provides the timestamp of the first row
            ms_results["data"][specie].drop("Time", axis="columns", inplace=True, errors="ignore")

            # If the file was provided in an incomplete form, the final rows will be NaN, so drop them
            ms_results["data"][specie].dropna(inplace=True)

        return ms_results