Skip to content

TGA

pydatalab.apps.tga special

blocks

MassSpecBlock (DataBlock)

Source code in pydatalab/apps/tga/blocks.py
class MassSpecBlock(DataBlock):
    """Render mass spectrometry (MS) traces from Mettler-Toledo ``.asc``
    exports as a grid of per-species bokeh plots.
    """

    blocktype = "ms"
    description = "Mass spectrometry (MS)"
    accepted_file_extensions = (".asc",)

    @property
    def plot_functions(self):
        """Plotting callables invoked when this block is refreshed."""
        return (self.generate_ms_plot,)

    def generate_ms_plot(self):
        """Parse the attached ``.asc`` file and store a bokeh grid plot of
        every detected species in ``self.data["bokeh_plot_data"]``.

        Returns early (logging a warning) if no file is attached or the
        file extension is not supported.
        """
        if "file_id" not in self.data:
            LOGGER.warning("No file set in the DataBlock")
            return

        file_info = get_file_info_by_id(self.data["file_id"], update_if_live=True)
        ext = os.path.splitext(file_info["location"].split("/")[-1])[-1].lower()
        if ext not in self.accepted_file_extensions:
            LOGGER.warning(
                "Unsupported file extension (must be one of %s, not %s)",
                self.accepted_file_extensions,
                ext,
            )
            return

        ms_data = parse_mt_mass_spec_ascii(Path(file_info["location"]))
        if not ms_data:
            return

        x_options = ["Time Relative [s]"]

        # Collect (species, data key, max value) so plots can be ordered by
        # intensity AND each plot uses its own species' data key. Previously
        # the data key from the final species of the parse loop leaked into
        # every plot, mislabelling species recorded with a different quantity.
        max_vals: List[Tuple[str, str, float]] = []

        for species, species_df in ms_data["data"].items():
            data_key = (
                "Partial Pressure [mbar]"
                if "Partial Pressure [mbar]" in species_df
                else "Ion Current [A]"
            )
            data = species_df[data_key].to_numpy()

            # Smooth with a window of ~10% of the trace length, clamped so it
            # always exceeds the polynomial order (3) for short traces, which
            # previously raised inside savgol_filter.
            window = max(len(data) // 10, 5)
            species_df[f"{data_key} (Savitzky-Golay)"] = savgol_filter(data, window, 3)

            max_vals.append((species, data_key, species_df[data_key].max()))

        plots = []
        # Most intense species first; only the first plot gets axis labels.
        for ind, (species, data_key, _) in enumerate(
            sorted(max_vals, key=lambda x: x[2], reverse=True)
        ):
            plots.append(
                selectable_axes_plot(
                    {species: ms_data["data"][species]},
                    x_options=x_options,
                    y_options=[data_key],
                    y_default=[
                        f"{data_key} (Savitzky-Golay)",
                        f"{data_key}",
                    ],
                    label_x=(ind == 0),
                    label_y=(ind == 0),
                    plot_line=True,
                    plot_points=False,
                    plot_title=f"Channel name: {species}",
                    plot_index=ind,
                    aspect_ratio=1.5,
                )
            )

            # Reduce tick clutter in the small multiples
            plots[-1].children[0].xaxis[0].ticker.desired_num_ticks = 2

        # Arrange all species plots into an M-column grid
        M = 3
        grid = [plots[i : i + M] for i in range(0, len(plots), M)]
        p = gridplot(grid, sizing_mode="scale_width", toolbar_location="below")

        self.data["bokeh_plot_data"] = bokeh.embed.json_item(p, theme=DATALAB_BOKEH_GRID_THEME)
accepted_file_extensions: Sequence[str]
blocktype: str
description: str
plot_functions property readonly
generate_ms_plot(self)
Source code in pydatalab/apps/tga/blocks.py
def generate_ms_plot(self):
    """Parse the attached ``.asc`` file and store a bokeh grid plot of
    every detected species in ``self.data["bokeh_plot_data"]``.

    Returns early (logging a warning) if no file is attached or the
    file extension is not supported.
    """
    if "file_id" not in self.data:
        LOGGER.warning("No file set in the DataBlock")
        return

    file_info = get_file_info_by_id(self.data["file_id"], update_if_live=True)
    ext = os.path.splitext(file_info["location"].split("/")[-1])[-1].lower()
    if ext not in self.accepted_file_extensions:
        LOGGER.warning(
            "Unsupported file extension (must be one of %s, not %s)",
            self.accepted_file_extensions,
            ext,
        )
        return

    ms_data = parse_mt_mass_spec_ascii(Path(file_info["location"]))
    if not ms_data:
        return

    x_options = ["Time Relative [s]"]

    # Collect (species, data key, max value) so plots can be ordered by
    # intensity AND each plot uses its own species' data key. Previously
    # the data key from the final species of the parse loop leaked into
    # every plot, mislabelling species recorded with a different quantity.
    max_vals: List[Tuple[str, str, float]] = []

    for species, species_df in ms_data["data"].items():
        data_key = (
            "Partial Pressure [mbar]"
            if "Partial Pressure [mbar]" in species_df
            else "Ion Current [A]"
        )
        data = species_df[data_key].to_numpy()

        # Smooth with a window of ~10% of the trace length, clamped so it
        # always exceeds the polynomial order (3) for short traces, which
        # previously raised inside savgol_filter.
        window = max(len(data) // 10, 5)
        species_df[f"{data_key} (Savitzky-Golay)"] = savgol_filter(data, window, 3)

        max_vals.append((species, data_key, species_df[data_key].max()))

    plots = []
    # Most intense species first; only the first plot gets axis labels.
    for ind, (species, data_key, _) in enumerate(
        sorted(max_vals, key=lambda x: x[2], reverse=True)
    ):
        plots.append(
            selectable_axes_plot(
                {species: ms_data["data"][species]},
                x_options=x_options,
                y_options=[data_key],
                y_default=[
                    f"{data_key} (Savitzky-Golay)",
                    f"{data_key}",
                ],
                label_x=(ind == 0),
                label_y=(ind == 0),
                plot_line=True,
                plot_points=False,
                plot_title=f"Channel name: {species}",
                plot_index=ind,
                aspect_ratio=1.5,
            )
        )

        # Reduce tick clutter in the small multiples
        plots[-1].children[0].xaxis[0].ticker.desired_num_ticks = 2

    # Arrange all species plots into an M-column grid
    M = 3
    grid = [plots[i : i + M] for i in range(0, len(plots), M)]
    p = gridplot(grid, sizing_mode="scale_width", toolbar_location="below")

    self.data["bokeh_plot_data"] = bokeh.embed.json_item(p, theme=DATALAB_BOKEH_GRID_THEME)

parsers

parse_mt_mass_spec_ascii(path: Path) -> Dict[str, Union[pandas.core.frame.DataFrame, Dict]]

Parses an .asc file containing MS results from a Mettler-Toledo spectrometer and returns a dictionary with keys data and meta, which themselves contain a dictionary of dataframes for each species with the species names/masses as keys, and a dictionary of metadata fields respectively.

Parameters:

| Name | Type | Description                    | Default  |
|------|------|--------------------------------|----------|
| path | Path | The path of the file to parse. | required |
Source code in pydatalab/apps/tga/parsers.py
def parse_mt_mass_spec_ascii(path: Path) -> Dict[str, Union[pd.DataFrame, Dict]]:
    """Parses an .asc file containing MS results from a Mettler-Toledo
    spectrometer and returns a dictionary with keys `data` and `meta`,
    which themselves contain a dictionary of dataframes for each species
    with the species names/masses as keys, and a dictionary of
    metadata fields respectively.

    Parameters:
        path: The path of the file to parse.

    Returns:
        A dictionary with keys `meta` (the header fields, with the time
        fields parsed to datetimes) and `data` (one dataframe per species,
        keyed by species name/mass).

    Raises:
        RuntimeError: If `path` does not exist.
        ValueError: If the header fields or the species list cannot be
            found within the expected number of lines.

    """

    header_keys = ("Sourcefile", "Exporttime", "Start Time", "End Time")
    data_keys = ("Time Relative [s]", "Partial Pressure [mbar]", "Ion Current [A]")
    header = {}
    species = []

    if not path.exists():
        raise RuntimeError(f"Provided path does not exist: {path!r}")

    with open(path) as f:
        # Read start of file until all header keys have been found
        max_header_lines = 8
        reads = 0
        header_end = None
        while reads < max_header_lines:
            line = f.readline().strip()
            reads += 1
            if line:
                for key in header_keys:
                    if key in line:
                        header[key] = line.split(key)[-1].strip()
            if all(k in header for k in header_keys):
                # Remember the byte offset of the end of the header so it can
                # be reported if the species list cannot be found below
                header_end = f.tell()
                break
        else:
            raise ValueError(
                f"Could not find all header keys in first {max_header_lines} lines of file."
            )

        # Parse the timestamp-valued header fields ("Exporttime", "Start Time",
        # "End Time") into datetime objects; "Sourcefile" stays a string
        for key in header_keys[1:]:
            if "time" in key.lower():
                header[key] = dateutil.parser.parse(header[key])  # type: ignore

        # The first non-empty line after the header is the whitespace-separated
        # list of species names/masses
        reads = 0
        max_species_lines = 10
        while reads < max_species_lines:
            line = f.readline().strip()
            reads += 1
            if not line:
                continue
            species = line.split()
            break
        else:
            # NB: header_end is a byte offset (from f.tell()), not a line number
            raise ValueError(
                f"Could not find species list within {max_species_lines} lines after the header (header ended at byte offset {header_end})."
            )

        # Read data with duplicated keys: will have (column number % number of data keys) appended to them
        # MT software also writes "---" if the value is missing, so parse these as NaNs to remove later
        df = pd.read_csv(f, sep="\t", header=0, parse_dates=False, na_values=["---"])
        ms_results: Dict[str, Union[pd.DataFrame, Dict]] = {}
        ms_results["meta"] = header
        ms_results["data"] = {}

        # Some files have Ion Current [A] or Partial Pressure [mbar] -- only rename those that are present
        present_keys = set(df.columns.values) & set(data_keys)
        for ind, specie in enumerate(species):
            # Loop over all species and rename the columns to remove the species name and disaggregate as a dict
            species_data_keys = [k + f"{'.' + str(ind) if ind != 0 else ''}" for k in present_keys]
            ms_results["data"][specie] = df[species_data_keys].rename(
                {mangled: original for mangled, original in zip(species_data_keys, present_keys)},
                axis="columns",
            )

            # Drop time axis as format cannot be easily inferred and data is essentially duplicated: "Start Time" in header
            # provides the timestamp of the first row
            ms_results["data"][specie].drop("Time", axis="columns", inplace=True, errors="ignore")

            # If the file was provided in an incomplete form, the final rows will be NaN, so drop them
            ms_results["data"][specie].dropna(inplace=True)

        return ms_results