Streaming CSV import (Node streams)

The first time a CSV import OOM’d a production process, I stopped trusting ‘just read it into memory’. Now I stream the upload to disk (or S3), stream-parse rows, and batch the inserts into the DB. The win is that memory usage stays flat even when the file is huge. I also validate per row and collect rejected rows into an error report instead of failing the entire import.

Backpressure matters here: await your DB writes so the parser can’t outrun your database. If throughput becomes an issue, I add a small concurrency limit, but I never go fully unbounded. This keeps imports predictable, debuggable, and safe as data sizes grow. Here’s roughly what each piece looks like.
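
Getting the upload onto disk without buffering it is the first piece. A minimal sketch using Node’s built-in stream `pipeline`, assuming the route hands you the raw request body (a multipart parser would slot in front of this; I’m glossing over that here). `saveUploadToDisk` and `tmpPath` are names I made up for the example.

```ts
import fs from "node:fs";
import { pipeline } from "node:stream/promises";
import type { IncomingMessage } from "node:http";

// Stream the incoming request body straight to a temp file so the upload
// never has to fit in memory. pipeline() propagates errors and backpressure.
export async function saveUploadToDisk(req: IncomingMessage, tmpPath: string): Promise<string> {
  await pipeline(req, fs.createWriteStream(tmpPath));
  return tmpPath;
}
```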
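
For the parse–validate–insert loop, here’s a minimal sketch assuming the `csv-parse` package for streaming parsing. `validateRow` and `insertBatch` are placeholders for your own validation and DB client, passed in so the shape stays clear; the batch size is arbitrary.

```ts
import fs from "node:fs";
import { parse } from "csv-parse"; // streaming CSV parser (assumed dependency)

type Row = Record<string, string>;

interface ImportDeps {
  validateRow: (row: Row) => string | null;    // your per-row checks: error message or null
  insertBatch: (rows: Row[]) => Promise<void>; // your DB write for one batch
}

const BATCH_SIZE = 500;

export async function importCsv(path: string, { validateRow, insertBatch }: ImportDeps) {
  const rejected: Array<{ record: number; error: string; row: Row }> = [];
  let batch: Row[] = [];
  let imported = 0;
  let record = 0; // 1-based data-row count (header excluded)

  // The file stream feeds the parser a chunk at a time, and async iteration
  // pauses the parser while we await DB writes: that is the backpressure.
  const parser = fs.createReadStream(path).pipe(parse({ columns: true }));

  for await (const row of parser as AsyncIterable<Row>) {
    record++;
    const error = validateRow(row);
    if (error) {
      rejected.push({ record, error, row }); // collect rejects instead of aborting the import
      continue;
    }
    batch.push(row);
    if (batch.length >= BATCH_SIZE) {
      await insertBatch(batch); // awaiting here is what keeps memory flat
      imported += batch.length;
      batch = [];
    }
  }
  if (batch.length > 0) {
    await insertBatch(batch); // flush the final partial batch
    imported += batch.length;
  }

  return { imported, rejected }; // the rejected list becomes the error report
}
```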
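
And when a single awaited insert per batch becomes the bottleneck, the cap I mentioned looks roughly like this. `createBatchQueue` is a made-up helper, not a library: it lets a small wave of inserts run concurrently, then waits for the whole wave, which is cruder than a sliding window but still strictly bounded.

```ts
type Row = Record<string, string>;

// Coarse "wave" limiter: let up to `max` batch inserts run at once, then wait
// for the whole wave before queuing more.
export function createBatchQueue(
  insertBatch: (rows: Row[]) => Promise<void>,
  max = 4,
) {
  let inFlight: Promise<void>[] = [];
  const errors: unknown[] = [];

  const settleWave = async () => {
    await Promise.all(inFlight);
    inFlight = [];
    if (errors.length > 0) {
      throw new AggregateError(errors, "one or more batch inserts failed");
    }
  };

  return {
    async push(rows: Row[]) {
      // Catch immediately so a failed insert can't become an unhandled rejection;
      // the error resurfaces at the next wave boundary instead.
      inFlight.push(insertBatch(rows).catch((err) => { errors.push(err); }));
      if (inFlight.length >= max) await settleWave();
    },
    // Call once after the loop to wait for the final, partial wave.
    drain: settleWave,
  };
}
```

In the sketch above, `await insertBatch(batch)` would become `await queue.push(batch)`, with `await queue.drain()` after the loop so the last wave finishes before the error report is returned.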