Skip to content

capturegraph.data.containers.utilities.columnize #

columnize - Transform List of Records to Dict of Columns#

Transform a List of dicts/objects into a Dict of Lists for DataFrame creation.

Example
from capturegraph.data import columnize

# sessions is List of session objects
cols = columnize(sessions)
# → Dict with column-oriented data

import pandas as pd
df = pd.DataFrame(cols)

columnize(data) #

Transform a List of dicts/objects into a Dict of Lists (column-oriented).

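Takes a List of records and transposes it into a Dict of columns, where nested dicts/objects are flattened with dot-separated keys. All output lists have the same length (one entry per input item); a record that lacks a key contributes None to that column. Columns are emitted in sorted key order.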

Parameters:

Name Type Description Default
data List[Any]

A List of dicts or objects with attributes.

required

Returns:

Type Description
Dict[str, List[Any]]

Dict mapping dot-separated keys to Lists of values.

Example
sessions = List([
    Dict({"date": "2024-01-01", "weather": Dict({"temp": 20, "humidity": 0.5})}),
    Dict({"date": "2024-01-02", "weather": Dict({"temp": 22, "humidity": 0.6})}),
])
columnize(sessions)
# Dict({
#     'date': ['2024-01-01', '2024-01-02'],
#     'weather.humidity': [0.5, 0.6],
#     'weather.temp': [20, 22]
# })

# Convert to DataFrame
import pandas as pd
df = pd.DataFrame(columnize(sessions))
Source code in capturegraph-lib/capturegraph/data/containers/utilities/columnize.py
def columnize(data: List[Any]) -> Dict[str, List[Any]]:
    """Transform a List of dicts/objects into a Dict of Lists (column-oriented).

    Takes a List of records and transposes it into a Dict of columns, where
    nested dicts/objects are flattened with dot-separated keys. All output
    lists have the same length (one entry per input item); a record that
    lacks a key contributes ``None`` to that column. Columns are emitted in
    sorted key order.

    Lists and tuples are kept as single cell values and are never flattened.
    An empty nested dict contributes no keys of its own.

    Keys are tracked as paths instead of being re-parsed from the dotted
    column name only, so a dict key that itself contains a ``"."`` (or is not
    a string) still resolves to its value rather than silently becoming
    ``None``. Column names are always strings.

    Args:
        data: A List of dicts or objects with attributes.

    Returns:
        Dict mapping dot-separated keys to Lists of values. An empty Dict is
        returned when ``data`` is empty.

    Example:
        ```python
        sessions = List([
            Dict({"date": "2024-01-01", "weather": Dict({"temp": 20, "humidity": 0.5})}),
            Dict({"date": "2024-01-02", "weather": Dict({"temp": 22, "humidity": 0.6})}),
        ])
        columnize(sessions)
        # Dict({
        #     'date': ['2024-01-01', '2024-01-02'],
        #     'weather.humidity': [0.5, 0.6],
        #     'weather.temp': [20, 22]
        # })

        # Convert to DataFrame
        import pandas as pd
        df = pd.DataFrame(columnize(sessions))
        ```
    """
    if not data:
        return Dict({})

    def _children(obj: Any) -> "list[tuple[Any, Any]] | None":
        """Return the (key, value) pairs of a dict/object, or None for a leaf."""
        if isinstance(obj, dict):
            return list(obj.items())
        # NOTE(review): `_vars` is defined elsewhere in this module; it is used
        # here as "names of the object's attributes, falsy for leaf values" --
        # confirm against its definition.
        names = _vars(obj)
        if names:
            return [(name, getattr(obj, name)) for name in names]
        return None

    def _leaf_paths(obj: Any, path: tuple[Any, ...] = ()) -> list[tuple[Any, ...]]:
        """Recursively collect the key path of every leaf value under ``obj``."""
        children = _children(obj)
        if children is None:
            # A bare leaf only counts when it was reached through some key.
            return [path] if path else []

        paths: list[tuple[Any, ...]] = []
        for key, value in children:
            child_path = path + (key,)
            # Don't recurse into lists/tuples: they are cell values.
            if isinstance(value, (list, tuple)):
                paths.append(child_path)
            else:
                paths.extend(_leaf_paths(value, child_path))
        return paths

    def _resolve(obj: Any, path: tuple[Any, ...]) -> Any:
        """Follow ``path`` through nested dicts/objects; None if any step is missing."""
        current = obj
        for part in path:
            if isinstance(current, dict):
                current = current.get(part)
            elif isinstance(part, str):
                current = getattr(current, part, None)
            else:
                # Attribute names must be strings; a non-string key can only
                # ever match a dict entry.
                return None
            if current is None:
                return None
        return current

    def _lookup(obj: Any, paths: list[tuple[Any, ...]]) -> Any:
        """Return the first non-None value found via any candidate path."""
        for path in paths:
            value = _resolve(obj, path)
            if value is not None:
                return value
        return None

    # Map each column name to the key paths that can produce it. The dot-split
    # form of the name always comes first so that every value that resolved
    # under plain dotted lookup keeps resolving to exactly the same thing; the
    # literal paths (keys containing "." or non-string keys) act as fallbacks.
    candidates: dict[str, list[tuple[Any, ...]]] = {}
    for item in data:
        for path in _leaf_paths(item):
            name = ".".join(f"{part}" for part in path)
            paths = candidates.setdefault(name, [tuple(name.split("."))])
            if path not in paths:
                paths.append(path)

    # One list per column, one entry per input item, columns in sorted order.
    result: dict[str, list[Any]] = {
        name: [_lookup(item, candidates[name]) for item in data]
        for name in sorted(candidates)
    }

    return Dict(result)