> ## Documentation Index
> Fetch the complete documentation index at: https://docs.baseten.co/llms.txt
> Use this file to discover all available pages before exploring further.

# Streaming

> Return model output token by token as it is generated.

export const MiniStreaming = () => {
  const ref = React.useRef(null);
  const init = React.useRef(false);
  React.useEffect(() => {
    if (!ref.current || init.current) return;
    init.current = true;
    const W = 620, H = 158, padL = 16, padR = 16, labelW = 100, TOKENS = 18;
    const TOK = ["Tok", "ens", "stream", "out", "one", "by", "one", "from", "the", "first", "tok", "en", "on", "ward", "to", "you", "now", "."];
    const isDark = () => document.documentElement.classList.contains("dark");
    const C = () => isDark() ? {
      sub: "#869089",
      body: "#dee4de",
      brd: "#344339",
      track: "rgba(255,255,255,0.05)",
      stripBg: "#0C1D13",
      stripBrd: "#203026",
      head: "#9CA59E",
      ok: ["#17D465", "rgba(23,212,101,0.22)"],
      wait: ["#5b9dff", "rgba(91,157,255,0.18)"],
      ttft: "#d6a52a"
    } : {
      sub: "#869089",
      body: "#021309",
      brd: "#dee4de",
      track: "rgba(0,0,0,0.04)",
      stripBg: "#f4f9f3",
      stripBrd: "#dee4de",
      head: "#5a675e",
      ok: ["#0e863f", "rgba(178,247,207,0.7)"],
      wait: ["#1960d3", "rgba(25,96,211,0.12)"],
      ttft: "#9c7400"
    };
    function setRich(el, s) {
      el.replaceChildren();
      const parts = s.split("`");
      for (let i = 0; i < parts.length; i++) {
        if (i % 2 === 0) {
          if (parts[i]) el.appendChild(document.createTextNode(parts[i]));
        } else {
          const c = document.createElement("code");
          c.textContent = parts[i];
          c.style.cssText = "font-family:ui-monospace,Menlo,monospace;font-size:0.92em;background:" + (isDark() ? "rgba(255,255,255,0.08)" : "rgba(0,0,0,0.05)") + ";padding:1px 4px;border-radius:3px";
          el.appendChild(c);
        }
      }
    }
    let p = 0, endHold = 0, visible = true, raf = 0, last = 0;
    const cv = document.createElement("canvas");
    cv.style.cssText = "display:block;width:100%;max-width:" + W + "px;touch-action:pan-y";
    const ctx = cv.getContext("2d");
    const dpr = window.devicePixelRatio || 1;
    cv.width = W * dpr;
    cv.height = H * dpr;
    cv.style.height = H + "px";
    ctx.scale(dpr, dpr);
    const strip = document.createElement("div");
    const sDot = document.createElement("span");
    sDot.style.cssText = "flex:0 0 auto;width:8px;height:8px;border-radius:50%;margin-top:6px";
    const sTxt = document.createElement("div");
    sTxt.style.cssText = "flex:1;min-width:0";
    const sTit = document.createElement("div");
    sTit.style.cssText = "font:500 11px ui-monospace,Menlo,monospace;letter-spacing:-0.28px;color:#869089;margin:0 0 2px";
    const sBod = document.createElement("div");
    sBod.style.cssText = "font:400 13px/1.4 system-ui,-apple-system,sans-serif;margin:0";
    sTxt.appendChild(sTit);
    sTxt.appendChild(sBod);
    strip.appendChild(sDot);
    strip.appendChild(sTxt);
    ref.current.appendChild(cv);
    ref.current.appendChild(strip);
    const trackX0 = padL + labelW, trackX1 = W - padR, trackW = trackX1 - trackX0;
    const preFrac = 0.16, ttftX = trackX0 + preFrac * trackW;
    const yS = 62, yB = 108, ch = 24;
    ctx.font = "600 9px ui-monospace,Menlo,monospace";
    const avail = trackX1 - ttftX, tokPad = 10;
    const natural = TOK.map(t => ctx.measureText(t).width + tokPad);
    const tokScale = avail / natural.reduce((a, b) => a + b, 0);
    const tokPx = Math.max(7, 9 * Math.min(1, tokScale));
    const cellX = [], cellW = [];
    {
      let acc = ttftX;
      for (const n of natural) {
        const w = n * tokScale;
        cellX.push(acc);
        cellW.push(w);
        acc += w;
      }
    }
    function rr(x, y, w, h, r) {
      ctx.beginPath();
      ctx.roundRect(x, y, w, h, r);
    }
    function laneTokens(y, headX, allDone, col) {
      const preEnd = Math.min(headX, ttftX);
      rr(trackX0, y - ch / 2, ttftX - trackX0, ch, 3);
      ctx.fillStyle = col.wait[1];
      ctx.fill();
      ctx.strokeStyle = col.wait[0];
      ctx.lineWidth = 1;
      ctx.stroke();
      if (preEnd > trackX0) {
        ctx.save();
        ctx.beginPath();
        ctx.rect(trackX0, y - ch / 2, preEnd - trackX0, ch);
        ctx.clip();
        rr(trackX0, y - ch / 2, ttftX - trackX0, ch, 3);
        ctx.fillStyle = col.wait[0];
        ctx.globalAlpha = 0.28;
        ctx.fill();
        ctx.globalAlpha = 1;
        ctx.restore();
      }
      ctx.fillStyle = col.sub;
      ctx.font = "500 9px ui-monospace,Menlo,monospace";
      ctx.textAlign = "center";
      ctx.textBaseline = "middle";
      if (ttftX - trackX0 > 36) ctx.fillText("prefill", (trackX0 + ttftX) / 2, y);
      for (let i = 0; i < TOKENS; i++) {
        const x = cellX[i], w = cellW[i], on = allDone || headX >= x + w * 0.5;
        rr(x + 1, y - ch / 2, w - 2, ch, 2);
        ctx.fillStyle = on ? col.ok[1] : "transparent";
        ctx.fill();
        ctx.strokeStyle = on ? col.ok[0] : col.brd;
        ctx.lineWidth = 1;
        ctx.stroke();
        if (on) {
          ctx.fillStyle = col.ok[0];
          ctx.font = "600 " + tokPx + "px ui-monospace,Menlo,monospace";
          ctx.textAlign = "center";
          ctx.textBaseline = "middle";
          ctx.fillText(TOK[i], x + w / 2, y + 1);
        }
      }
    }
    function draw() {
      const col = C(), headX = trackX0 + p * trackW, done = endHold > 0;
      ctx.clearRect(0, 0, W, H);
      ctx.font = "500 9px ui-monospace,Menlo,monospace";
      ctx.fillStyle = col.sub;
      ctx.textBaseline = "middle";
      ctx.textAlign = "left";
      ctx.fillText("request sent", trackX0, 18);
      ctx.textAlign = "right";
      ctx.fillText("generation complete", trackX1, 18);
      ctx.textAlign = "right";
      ctx.font = "600 11px ui-monospace,Menlo,monospace";
      ctx.fillStyle = col.ok[0];
      ctx.fillText("streaming", trackX0 - 12, yS);
      ctx.fillStyle = col.sub;
      ctx.fillText("non-streaming", trackX0 - 12, yB);
      laneTokens(yS, headX, false, col);
      laneTokens(yB, done ? trackX1 + 50 : trackX0, done, col);
      ctx.strokeStyle = col.ttft;
      ctx.lineWidth = 1;
      ctx.setLineDash([3, 2]);
      ctx.beginPath();
      ctx.moveTo(ttftX, yS - ch / 2 - 12);
      ctx.lineTo(ttftX, yB + ch / 2 + 4);
      ctx.stroke();
      ctx.setLineDash([]);
      ctx.fillStyle = col.ttft;
      ctx.font = "600 9px ui-monospace,Menlo,monospace";
      ctx.textAlign = "center";
      ctx.textBaseline = "alphabetic";
      ctx.fillText("TTFT", ttftX, yS - ch / 2 - 15);
      if (!done) {
        ctx.strokeStyle = col.head;
        ctx.globalAlpha = 0.6;
        ctx.lineWidth = 1;
        ctx.beginPath();
        ctx.moveTo(headX, yS - ch / 2 - 4);
        ctx.lineTo(headX, yB + ch / 2 + 4);
        ctx.stroke();
        ctx.globalAlpha = 1;
      }
      const sTok = done ? TOKENS : cellX.filter((x, i) => headX >= x + cellW[i] * 0.5).length;
      ctx.font = "500 9px ui-monospace,Menlo,monospace";
      ctx.textAlign = "left";
      ctx.textBaseline = "alphabetic";
      ctx.fillStyle = col.sub;
      ctx.fillText("streaming: " + sTok + " / " + TOKENS + " tokens shown      non-streaming: " + (done ? TOKENS + " (all at once)" : "0 (still waiting)"), trackX0, H - 8);
      const e = done ? ["Same total time, different experience", "Both responses finish together, but the non-streaming caller waited through the whole generation while the streaming caller saw output from the first token onward."] : ["Streaming sends tokens as they generate", "With `stream=True`, each token is delivered the moment it is produced, so output appears at the first token. A non-streaming caller receives nothing until the full response is ready."];
      sDot.style.background = done ? col.ttft : col.ok[0];
      setRich(sTit, e[0]);
      setRich(sBod, e[1]);
      sBod.style.color = col.body;
      strip.style.cssText = "display:flex;align-items:flex-start;gap:10px;padding:10px 14px;margin:10px 0 0;border-radius:6px;height:76px;overflow:hidden;background:" + col.stripBg + ";border:1px solid " + col.stripBrd;
    }
    const io = new IntersectionObserver(en => visible = en[0].isIntersecting, {
      threshold: 0.1
    });
    io.observe(cv);
    const themeObs = new MutationObserver(() => draw());
    themeObs.observe(document.documentElement, {
      attributes: true,
      attributeFilter: ["class"]
    });
    function loop(ts) {
      raf = requestAnimationFrame(loop);
      if (!visible) {
        last = ts;
        return;
      }
      const dt = last ? Math.min(0.05, (ts - last) / 1000) : 0;
      last = ts;
      if (endHold > 0) {
        endHold -= dt;
        if (endHold <= 0) p = 0;
      } else {
        p += dt * 0.16;
        if (p >= 1) {
          p = 1;
          endHold = 1.8;
        }
      }
      draw();
    }
    draw();
    raf = requestAnimationFrame(loop);
    return () => {
      cancelAnimationFrame(raf);
      io.disconnect();
      themeObs.disconnect();
      cv.remove();
      strip.remove();
      init.current = false;
    };
  }, []);
  return <div ref={ref} />;
};

Streaming means returning a model's output incrementally, token by token, as it is generated, rather than holding the response until generation finishes. The caller reads the output as it builds, so it waits only for the time to first token (TTFT) before seeing output, not for the entire response.

Baseten supports streaming across a range of inference surfaces: [Model APIs](/inference/model-apis/overview) (hosted, OpenAI- and Anthropic-compatible endpoints), [BIS-LLM](/engines/bis-llm/overview), and dedicated deployments of models packaged with [Truss](/development/model/overview). [Custom Docker containers](/development/model/custom-server) that expose an OpenAI-compatible API, such as vLLM and SGLang, stream the same way.

Use streaming when:

* Generating the complete output takes a relatively long time.
* The first tokens are useful without the rest of the output.
* Reducing the time to first token improves the user experience.

Chat applications backed by LLMs are the clearest example.

## Enable streaming

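Streaming is enabled per request: turn it on in your call, then read the response as it arrives. With OpenAI-compatible clients and raw HTTP requests, set `stream` to true; the Anthropic SDK exposes the same option through `client.messages.stream(...)`. Beyond that, only the base URL and model slug differ.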

<CodeGroup>
  ```python Truss theme={"system"}
  # Self-deployed Truss model: stream from the model's predict endpoint
  import os
  import requests

  model_id = "YOUR_MODEL_ID"

  with requests.post(
      f"https://model-{model_id}.api.baseten.co/production/predict",
      headers={"Authorization": f"Bearer {os.environ['BASETEN_API_KEY']}"},
      json={"prompt": "Write a haiku about the ocean.", "stream": True},
      stream=True,  # tell requests not to download the whole body up front
  ) as resp:
      # Decode incrementally as UTF-8: iter_content() defaults to 1-byte chunks,
      # and decoding each chunk on its own fails on multi-byte characters.
      resp.encoding = "utf-8"
      # chunk_size=None yields data in whatever sizes it arrives from the server.
      for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
          print(chunk, end="", flush=True)
  ```

  ```python OpenAI theme={"system"}
  # Model APIs: OpenAI-compatible endpoint at inference.baseten.co
  import os
  from openai import OpenAI

  # Point the OpenAI client at the endpoint above and authenticate with BASETEN_API_KEY.
  client = OpenAI(
      base_url="https://inference.baseten.co/v1",
      api_key=os.environ["BASETEN_API_KEY"],
  )

  # stream=True makes create() return an iterable of chunks instead of one response.
  stream = client.chat.completions.create(
      model="deepseek-ai/DeepSeek-V4-Pro",
      messages=[{"role": "user", "content": "Write a haiku about the ocean."}],
      stream=True,
  )
  for chunk in stream:
      # `or ""` covers chunks whose delta has no text content.
      print(chunk.choices[0].delta.content or "", end="", flush=True)
  ```

  ```python Anthropic theme={"system"}
  # Model APIs: Anthropic-compatible endpoint (beta) at inference.baseten.co
  import os
  import anthropic

  api_key = os.environ["BASETEN_API_KEY"]

  client = anthropic.Anthropic(
      base_url="https://inference.baseten.co",
      api_key=api_key,
      # Also send the key as a Bearer token in the Authorization header.
      default_headers={"Authorization": f"Bearer {api_key}"},
  )

  # messages.stream() is used as a context manager; text_stream yields text as it arrives.
  with client.messages.stream(
      model="deepseek-ai/DeepSeek-V4-Pro",
      max_tokens=4096,
      messages=[{"role": "user", "content": "Write a haiku about the ocean."}],
  ) as stream:
      for text in stream.text_stream:
          print(text, end="", flush=True)
  ```

  ```bash cURL theme={"system"}
  # Model APIs: add "stream": true to the request body. --no-buffer turns off curl's
  # output buffering so each chunk is printed as soon as it arrives.
  curl https://inference.baseten.co/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $BASETEN_API_KEY" \
    -d '{
      "model": "deepseek-ai/DeepSeek-V4-Pro",
      "messages": [{"role": "user", "content": "Write a haiku about the ocean."}],
      "stream": true
    }' \
    --no-buffer
  ```
</CodeGroup>

Streaming changes when the caller sees output, not how much the model produces. The following diagram puts both delivery modes on one clock.

<MiniStreaming />

The top lane streams: after a short prefill, tokens fill in one at a time from the first-token mark (TTFT). The bottom lane is non-streaming: it stays empty through the same generation, then the whole response lands at once at the end. Both finish together, so the only difference is when the caller first sees output. Token timing here is illustrative, not a measured latency.
