# Source code for reemote.utilities.convert_to_df

# Copyright (c) 2025 Kim Jarvis TPF Software Services S.A. kim.jarvis@tpfsystems.com 
# This software is licensed under the MIT License. See the LICENSE file for details.
#
import json
import pandas as pd


# [docs]
# Column name -> (nested section or None, key inside it, required?).
# "Required" keys are read with [] so a missing key is reported as a warning;
# optional keys are read with .get() and silently default to ''.
# Dict order is the default column order of the resulting DataFrame.
_COLUMN_SPECS = {
    'command': ('op', 'command', True),
    'host': (None, 'host', True),
    'guard': ('op', 'guard', True),
    'changed': (None, 'changed', True),
    'executed': (None, 'executed', True),
    'stdout': ('cp', 'stdout', True),
    'stderr': ('cp', 'stderr', True),
    'exit_status': ('cp', 'exit_status', True),
    'returncode': ('cp', 'returncode', True),
    'env': ('cp', 'env', False),
    'subsystem': ('cp', 'subsystem', False),
    'exit_signal': ('cp', 'exit_signal', False),
    'error': (None, 'error', False),
}


def _extract_field(item, section, key, required):
    """Return one column value from a result dictionary.

    Args:
        item (dict): The row being flattened.
        section (str or None): Name of the nested object holding ``key``
            (e.g. 'op' or 'cp'), or None when ``key`` lives on ``item`` itself.
        key (str): The key to read.
        required (bool): If True the key is read with ``[]`` (a missing key
            raises KeyError); otherwise ``.get(key, '')`` is used.

    Returns:
        The stored value, or '' when the nested section is absent/empty or an
        optional key is missing.

    Raises:
        KeyError: A required key is missing.
        TypeError, AttributeError: The nested section is not a mapping.
    """
    if section is None:
        source = item
    else:
        source = item.get(section)
        if not source:
            # Missing, None or empty section: nothing to extract.
            return ''
    return source[key] if required else source.get(key, '')


def convert_to_df(data, columns=None):
    """Flatten a list of result dictionaries into a pandas DataFrame.

    Each dictionary in ``data`` becomes one row. Values are taken from the
    top level of the dictionary ('host', 'changed', 'executed', 'error') and
    from the nested 'op' ('command', 'guard') and 'cp' ('stdout', 'stderr',
    'exit_status', 'returncode', 'env', 'subsystem', 'exit_signal') objects.

    Values that cannot be extracted are replaced by an empty string. For
    'env', 'subsystem', 'exit_signal' and 'error' a missing key is silent;
    for the other columns a warning is printed to stdout. Items of ``data``
    that are not dictionaries are skipped, also with a printed warning.

    Args:
        data (str or list[dict]): The input data. This can be a JSON-formatted
            string or a list of dictionaries representing the rows.
        columns (iterable of str, optional): Column names to include in the
            output, in the desired order. Any iterable is accepted (it is
            consumed once). If None, all available columns are used.
            Defaults to None.

    Returns:
        pd.DataFrame: One row per dictionary in ``data``, with exactly the
            requested columns in the requested order.

    Raises:
        ValueError: If ``data`` is a string that cannot be parsed as JSON
            (the original ``json.JSONDecodeError`` is chained as the cause),
            or if a requested column name is not available.
    """
    # If data is a string, parse it as JSON
    if isinstance(data, str):
        try:
            data = json.loads(data)
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse JSON string: {e}") from e

    # Materialise the selection once: it is walked for validation, again for
    # every row, and finally handed to pandas, so a generator would otherwise
    # be exhausted after the validation pass.
    columns = list(_COLUMN_SPECS) if columns is None else list(columns)

    # Validate that all requested columns exist
    for col in columns:
        if col not in _COLUMN_SPECS:
            raise ValueError(f"Column '{col}' is not available. Available columns: {list(_COLUMN_SPECS)}")

    rows = []
    for item in data:
        # Skip items that are not dictionaries
        if not isinstance(item, dict):
            print(f"Warning: Skipping non-dictionary item: {item}")
            continue

        row = {}
        for col in columns:
            try:
                row[col] = _extract_field(item, *_COLUMN_SPECS[col])
            except (KeyError, TypeError, AttributeError) as e:
                # AttributeError covers .get() on a nested section that is not
                # a mapping, so optional columns degrade like required ones.
                row[col] = ''
                print(f"Warning: Could not extract column '{col}' from item: {e}")
        rows.append(row)

    # Create DataFrame with only the specified columns
    return pd.DataFrame(rows, columns=columns)