Retrieve data for custom analysis

In this example, we demonstrate how to retrieve data from the Nebuly endpoints and export the results to a CSV file for further analysis. Specifically, we use the get-interaction-aggregates and get-interactions endpoints to build a CSV in which each row represents a user action together with its associated topic and up to 10 sampled interactions for that action.
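
The script below depends on the requests, pandas, and tqdm packages (installable with pip) and reads your API key from the NEBULY_API_KEY environment variable.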

import os, csv, time
from typing import Dict, List
import requests
import pandas as pd
from tqdm import tqdm


BASE = "https://backend.nebuly.com/api/external"
AGG  = "/get-interaction-aggregates"
LIST = "/get-interactions"

time_range = {
    "start": "2024-01-01T00:00:00Z",  # Set here the date from which you want to get the topics
    "end": "2025-06-04T23:59:59Z"     # Set here the date to which you want to get the topics
}

HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.getenv('NEBULY_API_KEY')}"
}
if HEADERS["Authorization"] == "Bearer None":
    raise RuntimeError("Please set NEBULY_API_KEY in your environment")

def post(endpoint: str, payload: Dict) -> Dict:
    """Thin wrapper with basic error handling and auto-retry"""
    for attempt in range(3):
        r = requests.post(BASE + endpoint, json=payload, headers=HEADERS, timeout=30)
        if r.ok:
            return r.json()
        if r.status_code >= 500:
            time.sleep(2 + attempt)      # simple back-off
            continue
        raise RuntimeError(f"{r.status_code}: {r.text}")
    raise RuntimeError("Nebuly API repeatedly failed")

# 1️⃣  ────────────────────────────────────────────────────────────
# get *all* topics sorted by n_interactions (descending)
topic_resp = post(
    AGG,
    {
        "group_by": {"kind": "topic"},
        "variables": ["n_interactions", "n_users"],
        "filters": [],
        "time_range": time_range,
        "offset": 0,
        "limit": 100,
    },
)
# each item looks like {"group_name": ..., "n_interactions": ..., "n_users": ...}
topics = sorted(
    topic_resp["data"],
    key=lambda d: d["n_interactions"],
    reverse=True,
)
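# NOTE: a single call returns at most `limit` (here 100) topics; if you expect
# more, page through the results by increasing `offset` in steps of `limit`.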


# 2️⃣ & 3️⃣  ──────────────────────────────────────────────────────
# Fetch actions and samples for each topic
rows: List[Dict] = []
print("Fetching actions + samples …\n")
for t in tqdm(topics, unit="topic"):
    topic = t["group_name"]
    # a) actions for this topic
    actions_resp = post(
        AGG,
        {
            "group_by": {"kind": "user_action"},
            "variables": ["n_interactions", "n_users"],
            "filters": [{"kind": "topic", "values": [topic]}],
            "time_range": time_range,
            "offset": 0,
            "limit": 100,
        },
    )
    for act in actions_resp["data"]:
        action   = act["group_name"]
        n_inter  = act["n_interactions"]
        # b) ~10 interaction samples for this action
        int_resp = post(
            LIST,
            {
                "limit": 10,
                "filters": [
                    {"kind": "topic", "values": [topic]},
                    {"kind": "user_action", "values": [action]}
                ],
                "time_range": time_range,
            },
        )
        samples = [{"input": i["input_text"], "output": i["output_text"]} for i in int_resp["data"]]
        rows.append(
            {
                "topic": topic,
                "action": action,
                "interactions": n_inter,
                "samples": samples
            }
        )
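
# NOTE: the loop above issues one get-interactions request per (topic, action)
# pair, so the total number of API calls grows multiplicatively with the
# number of topics and the number of actions per topic.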

# 4️⃣  ────────────────────────────────────────────────────────────
# Flatten to a DataFrame and export; pandas serializes the samples column as
# the string repr of a list of dicts (see the read-back sketch below).
df = pd.DataFrame(rows)
df.to_csv("nebuly_topic_actions.csv", index=False, quoting=csv.QUOTE_ALL)