{"collectionById":{"3d6d52f4-0a1b-40c9-8839-9563e3971a6f":{"id":"3d6d52f4-0a1b-40c9-8839-9563e3971a6f","name":"Case Studies","fieldSchemas":[{"id":"a2275068-ae26-4dd5-add4-275933767ea8","name":"Slug","type":"slug","role":"slug"},{"id":"3182d33a-dbee-43c9-9931-b900cee1d70c","name":"Hero Image","type":"image"},{"id":"78fea6df-e8e8-4ba1-9f02-f908076b9ca4","name":"Content","type":"rich_text"},{"id":"0653af49-9844-4c79-9be9-6f36325da0ac","name":"Date Published","type":"date"},{"id":"81c3dafd-21d3-4e82-b304-3549b81e5502","name":"Title","type":"plain_text","role":"primary"},{"id":"c58740ec-dcc6-4a05-a207-bddab515c486","name":"Role","type":"rich_text"},{"id":"d81dd6fc-e8ef-48fc-8efc-00ff09595db8","name":"TLDR","type":"rich_text"}],"itemById":{"fe9cf38b-ce77-412f-9011-0ff99bd6c230":{"id":"fe9cf38b-ce77-412f-9011-0ff99bd6c230","index":"\"","collectionId":"3d6d52f4-0a1b-40c9-8839-9563e3971a6f","fields":[{"id":"dda1e653-936a-40f7-af10-add5e1541a57","value":"{\"image\":\"138ea7740fb0618c726a76c17ccd225c90e15053\",\"imageThumbnail\":\"1bee7548d9a72484a28710779578a9ea6ecf35b1\",\"originalImageHeight\":941,\"originalImageWidth\":1672,\"altText\":\"Video Editor Case Study\",\"fileName\":\"E4C6E581-9116-4E1F-9D12-B633FA0DE63A.png\"}","itemId":"fe9cf38b-ce77-412f-9011-0ff99bd6c230","fieldSchemaId":"3182d33a-dbee-43c9-9931-b900cee1d70c"},{"id":"b4f27593-abfd-43e7-9940-c663fcc11cbd","value":"2026-05-17","itemId":"fe9cf38b-ce77-412f-9011-0ff99bd6c230","fieldSchemaId":"0653af49-9844-4c79-9be9-6f36325da0ac"},{"id":"47a71c45-5cd6-4a3c-983e-1896af0796a7","value":"Building a Storytelling Platform for Talking-Character Video","itemId":"fe9cf38b-ce77-412f-9011-0ff99bd6c230","fieldSchemaId":"81c3dafd-21d3-4e82-b304-3549b81e5502"},{"id":"cf773f18-3e97-488f-91ee-4eb43520b2aa","value":"{\"root\":{\"children\":[{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The vision held for three years\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h2\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"In 2022, while doing generative AI work at Snap around multi-character image generation, I put together a deck describing what would come after still images: AI agents replicating the roles of a media production team — screenwriter, storyboard artist, character designer, VFX artist, sound producer — collaborating to help people tell stories. I sketched the personas, generated proof-of-concept videos by animating multi-character stills through one of Runway ML's earliest video models, and made the case that storytelling was about to become accessible in a way it never had been.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The vision didn't go anywhere immediately. The tools didn't exist. Single-shot AI video was unreliable, multi-shot narratives with character consistency were impossible, and the audio side was barely a category. 
So I shelved the deck and kept working — building prototypes, championing ComfyUI as Snap's internal AI platform [link: ComfyUI case study], staying close to the open-source ecosystem.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"What I learned during that period was a useful kind of patience. Engineering for a future that doesn't exist yet is mostly wasted motion. But staying close enough to the field to notice the moment an impossibility lifts is a different practice — and the moment came in late 2024.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The tools catch up\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h2\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Two open-source releases changed the calculus. WAN 2.1 pushed image-to-video past the threshold where motion stayed coherent and faces stayed faces over useful durations. Fantasy Talk, paired with TTS, produced talking-head videos from a single reference image with reasonable lip sync. Neither model would stay current for long — the ecosystem was already cycling through new versions every few weeks — but together they made one specific thing buildable: video of characters who could speak, with identities that persisted across clips.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"I narrowed the original vision to that. Rather than chase the full production-crew concept, I picked the one capability the new models made possible and committed to doing it well: \",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"talking-character video with persistent characters across multiple clips.\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" That was the MVP. 
Everything else could come later.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The first version was a web tool — one image, one prompt, one short clip. I built it as a frontend that called ComfyUI workflows through the platform's API. The limits of single-shot generation became obvious within hours of using it. A clip is not a story. To tell a story you need consistency across shots, audio, composition, rhythm. Each piece I added surfaced the next missing piece. Adding audio raised the question of where voices came from. Adding character consistency raised the question of where characters came from. Adding a second clip raised the question of how clips connected.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"About a month in, I realized I wasn't building a video generation tool. I was building toward the deck.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The web prototype\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h3\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"What emerged over the following months was a web-based multi-clip editor organized around \",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"per-character timelines\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\". Rather than a single track of clips, each character had their own. You created a character first — image, voice, name — and clips involving that character appeared on the character's timeline. A two-character scene meant clips on two timelines, aligned in time. 
The metaphor was less video editor and more small production crew, with each character carrying their own performance.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Clips could be generated text-to-video, image-to-video, with first-frame and last-frame conditioning, or with multi-character references. Each technique mapped to a specific ComfyUI workflow. The web app didn't run any AI directly — it orchestrated asset flow and called workflows by API.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The orchestration is where the engineering got interesting. I built custom ComfyUI nodes for Firebase input and output of images, video, audio, and text, letting workflows reference assets by UUID instead of file paths on whichever server happened to be running the job. I injected a job-claiming routine into the ComfyUI runtime that polled Firebase for tagged jobs and pushed status back through Realtime Database. And for live previews, I extended ComfyUI's TAESD preview support with a throttling system that wrote progressive previews to Firebase only when the previous write had completed — the in-flight write would be overridden by the newer one, with locking to keep things thread-safe. Submit a generation, see previews appear within seconds, watch them refine. For video, where generations take longer than images, watching the result emerge made the wait feel like part of the work rather than time lost.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The web version stayed internal-only. The audience was mostly the internal design team — a small set of users who came back regularly and surfaced what creative workflows actually needed. The infrastructure was deliberately prototype-grade: Firebase as a database, GCP instance groups running ComfyUI, custom nodes glued together with my own routing logic. 
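\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"That preview-throttling routine is small enough to sketch. Below is a minimal, hypothetical Python version; the real writer lived inside the ComfyUI runtime and wrote through the custom Firebase nodes, so the upload call here is a stand-in:\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"import threading\\n\\nclass ThrottledPreviewWriter:\\n    # Keeps at most one preview upload in flight; a newer preview\\n    # replaces the pending one instead of queueing behind it.\\n    def __init__(self, upload_fn):\\n        self.upload_fn = upload_fn  # hypothetical stand-in for the Firebase write\\n        self.lock = threading.Lock()\\n        self.pending = None\\n        self.in_flight = False\\n\\n    def submit(self, preview_bytes):\\n        with self.lock:\\n            if self.in_flight:\\n                self.pending = preview_bytes  # override the older pending write\\n                return\\n            self.in_flight = True\\n        self._drain(preview_bytes)\\n\\n    def _drain(self, preview_bytes):\\n        while True:\\n            self.upload_fn(preview_bytes)  # blocking upload of one preview\\n            with self.lock:\\n                if self.pending is None:\\n                    self.in_flight = False\\n                    return\\n                preview_bytes, self.pending = self.pending, None\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"code\",\"version\":1,\"language\":\"python\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"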
The point was to prove that multi-clip character-driven storytelling worked, not to harden it for scale.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The most surprising lesson from the web build was about characters themselves: they don't have to be people, but they have to be personified\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\".\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" A user could take a photo of a houseplant, give it a face and a voice, and turn it into a character. The constraint wasn't human form — it was a recognizable visual identity plus the ability to talk. That changed how I thought about the audience. The most interesting outputs from early users were rarely the ones I'd designed for.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Three other things came out of the web build that shaped what came next:\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The model layer kept moving fast. Fantasy Talk gave way to MultiTalk, then Infinite Talk. On audio, I started with Chatterbox (TTS with voice conversion that preserved expressive performance), later moved through VibeVoice, ACE Step for music, and eventually Qwen TTS. Keeping current with the open-source ecosystem was half the work.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Structured agent frameworks felt over-constrained. I prototyped with CrewAI Flows mid-build and found the rigid step orderings and boilerplate didn't fit. 
The right place for LLM intelligence wasn't an orchestration tier outside the tool — it was inside the workflow system itself, where the same nodes that powered manual editing could also be invoked by LLMs.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"And there was a Cursor-shaped insight: Cursor is a code editor with hooks where you can do everything manually, plus an agentic interface that uses the same hooks. The manual and agentic modes share primitives; the agent is a different interface to the same system, not a separate system. If I designed the editing tool with the right hooks for manual creation, those hooks could power agentic creation later — without needing to choose between \\\"tool for humans\\\" and \\\"tool for AI.\\\"\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The iOS pivot\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h2\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The pivot happened almost by accident. Alongside the web build, I had a small side project: an iOS app for experimenting with chains of ComfyUI workflows. I showed it next to the web version during a demo to the design team. The feedback surprised me. People didn't just like it — they connected with it in a way they hadn't connected with the web version. The phones were in their hands. The interaction was tactile.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"That feedback landed for a few reasons. Snap is a mobile company; the designers I worked with built for a mobile-first product. iOS was where I had eighteen years of fluency. And the small experimental version had something the web version didn't — it felt like a creative tool you'd want to use, not a demo of what AI could do.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"I made the call to rebuild on iOS as the primary platform. The web version stayed live but stopped getting updates. 
Because the iOS app was built entirely in SwiftUI with no UIKit-specific dependencies, supporting macOS later would be nearly free — same codebase, different target — so the pivot wasn't closing off the desktop possibility, just choosing the right starting point.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"What I built over the following months wasn't a port. It was a rethink, organized around three sections.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Characters\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h3\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The character library was where users built the cast they'd reuse across productions. Characters persisted across sessions and projects. Creating one started with an image — take a photo, generate from text via Qwen Image, use a bitmoji, or upload existing. From any starting point, you could edit the image with Qwen Image Edit through a continuous editing surface backed by a carousel of edit history. The carousel doubled as undo: tap an earlier state to return to it, with more recent edits still accessible if you wanted to branch. The pattern came from how creative editing actually works — not a straight line, but branching, backtracking, revisiting.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Then voice: upload, record, or design from a text description. Voice design used ElevenLabs initially (no open-source option existed); I migrated to Qwen TTS once it shipped voice design. The text-to-voice option was the one users were most surprised by. \\\"A gravelly voice like an old prospector\\\" produced something that felt right far more often than expected.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"A character description, auto-populated by Florence (a vision-language model captioning the image) and editable. A name. Done. 
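\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Condensed into workflow calls, that flow reads like a short script. A hedged sketch; submit is an invented stand-in for posting a tagged ComfyUI job through Firebase and awaiting its result:\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"# Hypothetical orchestration of the character-creation flow.\\ndef submit(workflow_tag, inputs):\\n    # Stand-in for posting a tagged job to Firebase and awaiting the result.\\n    raise NotImplementedError\\n\\ndef create_character(source_image=None, image_prompt=None, voice_prompt=None):\\n    # 1. Image: photo, upload, bitmoji, or text-to-image via Qwen Image.\\n    image = source_image or submit('qwen_image', {'prompt': image_prompt})\\n\\n    # 2. Edits via Qwen Image Edit append to a history carousel, so any\\n    #    earlier state stays reachable as an undo or branch point.\\n    history = [image]\\n\\n    # 3. Voice designed from a text description (ElevenLabs at first,\\n    #    Qwen TTS once it shipped voice design).\\n    voice = submit('voice_design', {'description': voice_prompt})\\n\\n    # 4. Description auto-populated by Florence captioning, then editable.\\n    description = submit('florence_caption', {'image': image})\\n\\n    return {'image': image, 'history': history,\\n            'voice': voice, 'description': description}\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"code\",\"version\":1,\"language\":\"python\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"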
The whole flow was a small studio in itself — light enough to use casually, deep enough to craft something specific.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Productions\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h3\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"A production was a full video editor — substantially more capable than the web version's timeline. Once you'd pulled characters into a production, you could generate clips, arrange them, edit them, and export either a complete video or archive the timeline as an episode for future continuation. The episode model was deliberate: I wanted productions to feel like ongoing series, not one-off videos.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The timeline was the most complex piece of the app and probably the most fun to build. Most timeline UIs in mobile editors are scroll views of clip thumbnails. This one had that, plus a separate playback timeline with the active clip's segments chopped into thumbnails at varying resolutions. A zoom control changed thumbnail fidelity. The playback head stayed anchored at center; scrubbing meant moving the timeline underneath it. 
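\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Under hypothetical names, the scrubbing model reduces to a single mapping from scroll offset to time:\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"def playhead_time(scroll_offset, viewport_width, pixels_per_second):\\n    # The playhead is fixed at the viewport center; scrubbing moves the\\n    # timeline underneath it, so the current time is a pure function of\\n    # scroll offset. Zoom changes pixels_per_second, which raises both\\n    # thumbnail fidelity and scrubbing precision together.\\n    return (scroll_offset + viewport_width / 2.0) / pixels_per_second\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"code\",\"version\":1,\"language\":\"python\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"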
The cut tool benefited especially — with zoom and precise scrubbing, you could position cuts with the kind of accuracy professional editors expect, not the rough drag-and-drop most mobile tools settle for.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"type\":\"image\",\"version\":1,\"hash\":\"7d8112d8fddd3e8fde8c8fac461bff1ffaf4a002\",\"src\":\"https://s3-alpha-sig.figma.com/img/7d81/12d8/fddd3e8fde8c8fac461bff1ffaf4a002?Expires=1779667200\u0026Key-Pair-Id=APKAQ4GOSFWCW27IBOMQ\u0026Signature=RdyiUSYIva05w5SpDpkVF5CM53~xNM8-tSuuE6W~JeOikDI3AIaXsFKHmpHMQSXpr5njy519C7g9jWCKw8cgRUj48wuljgiffCP1B~TRT7YVowSzHk1LtWN9GEaqjr~57vvviLdgQb~oaR-bj0eq69sxxDNmhNtziEtl2ADVC0HlDpTrXY5VGw8ztdoOsq0RrAlUmcNGxu3D6V2FqkbmyiKcrlPbCF2mx6rg3pYNBfl9074YZv4wU80lONtlwGsEQUxT6e0g6SPRfWf8TMqs8iTnY-1xW0pXNZxXszfMG-0MQ9NyWMBGOw9o1h6BgebJvm35~rhi2XaVxpaHTqWdRQ__\",\"altText\":\"diagram\",\"originalImageWidth\":1672,\"originalImageHeight\":941,\"isFillWidth\":false}],\"direction\":null,\"format\":\"left\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Underneath, the video preview was a state machine. The data model — clips, ordering, characters, audio — was observed throughout the system. Any change to the model, whether from a user action or an async server event, triggered a new AVFoundation composition that propagated to every observer: the preview player updated, thumbnail strips re-rendered for new segments, the export pipeline became aware of the new state. No manual coordination. The data model was the source of truth and AVFoundation was a deterministic transformation of it.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"This made parallel generation with live previews work coherently. Users could start one clip generating and immediately start another. Each in-progress generation showed its TAESD previews in the right place on the timeline with progress indicators. Users could skip between clips, check on generations, edit arrangement — the system stayed coherent throughout. Most AI video tools make you wait, then commit, then wait again. 
This one let you keep moving.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Feed\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h3\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The third section was minimal: a vertical-playback feed for sharing episodes internally. Chronological, swipeable, inline. The point wasn't to build a social product — it was to give users a reason to see what others were making and to share what they'd made. Users started recognizing each other's styles. Episodes occasionally got passed around in chat channels. The scope was right.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"macOS as a feature decision\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h3\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"When users asked for desktop, macOS was almost free. The SwiftUI codebase ran on both with minor adjustments for cursor input and keyboard shortcuts. This wasn't an architectural triumph — it was SwiftUI doing what it was designed to do — but it's an example of how earlier technology choices pay off later. Choosing SwiftUI for the iOS app meant macOS support was a feature decision, not an engineering project.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The architecture that made it work\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h2\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The iOS app surfaced a problem the web prototype had hidden. With the web version, users worked one clip at a time and sequential generation was tolerable. 
The iOS app — with its state-machine timeline, parallel generation, and live previews — pushed the underlying infrastructure into territory the prototype-grade backend wasn't built for.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The bottleneck was model loading. Video generation models are large; loading them into GPU memory takes a minute or more. If users waited a minute every time they started a new clip because the model wasn't warm, the responsive feel of the iOS app would have evaporated. The whole experience depended on models staying loaded across requests — and the more complex features the lens team wanted needed pipelines whose stages couldn't all live in the same GPU's memory at once.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"But you can't keep every model warm on every server. The project ran on a pool of GPUs tiered across hardware classes — A100s for the heaviest video work, L4s and T4s for lighter models, CPU instances for orchestration. That pool had to support video generation, image generation and editing, audio synthesis, captioning, and increasingly LLM-driven features, alongside a staging environment that mirrored production.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The move that made everything work was \",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"specialized workflow pools with peer-pool 
routing\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\".\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"type\":\"image\",\"version\":1,\"hash\":\"b4bb5085f5af71f44fcd077109f90342c52ac50e\",\"src\":\"https://s3-alpha-sig.figma.com/img/b4bb/5085/f5af71f44fcd077109f90342c52ac50e?Expires=1779667200\u0026Key-Pair-Id=APKAQ4GOSFWCW27IBOMQ\u0026Signature=dloRDVV4PyZFLb5bHmCOxuHIBF2zxUFPNfdOgJnHdyjwO4~j2b-OirPwsskL3tWGgrtfjfEWu2nuej9r1VxUsw6Zo8LKOdfmin3TboeSDZvvFI7XW53fEcFEqPxpvcn6VyX7~FzGHZYJufD1fMNvcMa997dO9KvuvCn92qXvG6ofHskfoFMwtrcNYUs9FDfiDTzr65gby94gfxQikjqyCZSS1hmu~toDhexb8tHtPUNZyWasSeL2-Z7vGkLCFfAGWSh9nW1HVsDfaED8-DkdQkpcGg4nt2ZlM8K~5XMi7z1z2xl5XCfWNqKWRzQLQ6pUvq56VTyfkvzUXZecW40QVw__\",\"altText\":\"diagram\",\"originalImageWidth\":1672,\"originalImageHeight\":941,\"isFillWidth\":false}],\"direction\":null,\"format\":\"left\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"I divided the available GPUs into pools specialized by model category. Talking-character video models — Infinite Talk and its predecessors — got the largest allocation, sized to absorb demo spikes. Qwen Image and Qwen Image Edit got their own. Florence ran on T4s; audio models on L4s or T4s. Putting smaller models on cheaper hardware freed the A100s for work that actually needed them.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Each pool kept its models loaded so generation requests landing on the right pool found the right model already warm. Routing solved the \\\"right pool\\\" part: every workflow was tagged with the server type it needed. The job-claiming routine I'd injected into ComfyUI got extended — each server's claimer listened only for jobs tagged for its type. Workflows submitted from the iOS app got posted to Firebase with their server tag, the appropriate server claimed them, ran them, posted results back.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The architecture's elegance came from the peer model: every pool — GPU and CPU alike — is just a worker that claims tagged jobs from the queue, and any worker can submit new jobs back to the queue. There's no master, no orchestrator tier, no privileged role. 
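\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"In sketch form, every worker ran the same claim loop and only the tag differed. The helper names below are invented stand-ins for the Firebase transaction and the local ComfyUI execution call:\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"import time\\n\\nSERVER_TYPE = 'talking_character_video'  # this pool's tag; hypothetical value\\n\\ndef claim_loop(db, run_workflow):\\n    # db and run_workflow stand in for the Firebase client and the\\n    # local ComfyUI execution entry point.\\n    while True:\\n        job = db.claim_next_job(tag=SERVER_TYPE)  # atomic claim, or None\\n        if job is None:\\n            time.sleep(0.5)\\n            continue\\n        db.set_status(job.id, 'running')\\n        try:\\n            result = run_workflow(job.workflow, job.inputs)\\n            # A stage can fan out: sub-jobs go back on the same queue with\\n            # a different tag, e.g. the audio pool for background music.\\n            for sub in result.sub_jobs:\\n                db.post_job(tag=sub.tag, workflow=sub.workflow, inputs=sub.inputs)\\n            db.set_status(job.id, 'done', outputs=result.asset_ids)\\n        except Exception as exc:\\n            db.set_status(job.id, 'failed', error=str(exc))\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"code\",\"version\":1,\"language\":\"python\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"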
That symmetry made two things possible that mattered more in practice than the elegance itself.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The first was a workaround for the VRAM ceiling. Some workflows needed more VRAM than a single A100 could provide — character-controlled video with pose estimation and mask generation as conditioning, for instance, ran into memory limits when every stage tried to live on the same card. Splitting the workflow across servers, with each stage tagged for the pool that fit it, let those features ship without waiting on larger or multi-GPU hardware. A workflow on the talking-character pool could generate video, then post a tagged sub-job to the audio pool for background music, then continue composition once that returned. I built remote workflow nodes for this; when ComfyUI added async node support for API calls, the same async machinery worked transparently for cross-server coordination.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The second was cost-appropriate work placement. The CPU pool wasn't there for symmetry — it was there because lightweight orchestration, LLM API calls, and async coordination didn't need GPU time at all. Letting that work run on CPU or T4 instances meant the A100 hours stayed reserved for the generation work that actually needed them. The peer-pool model made this natural to express: orchestration workflows submitted GPU sub-jobs the same way GPU workflows submitted audio sub-jobs.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Two other pieces of platform work landed during this period:\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"I ported the project's custom nodes — hundreds of them — to ComfyUI's new V2 schema. 
The migration was substantial but bought better typing, introspection, and composition primitives that mattered as workflows got more complex.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"And I wrapped ComfyUI's API nodes for closed-source models (OpenAI, Gemini, and others) to call Snap's internal backend instead of the providers directly. The node interfaces stayed identical, so workflows could swap between standard and internal versions transparently. A useful side effect: once the wrappers existed on experimental servers, any internal team with ComfyUI access could explore closed-source APIs without setting up their own authentication. The wrapped API nodes became shared infrastructure for closed-source model exploration across the broader team.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The infrastructure matched the project's R\u0026D stage. Firebase as a job board works at internal-team scale; a public product would warrant Vertex Engine or a hardened production ComfyUI deployment with proper job queues and workflow versioning. The architecture did what the project needed at the stage it was in.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Real adoption\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h2\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"For most of the project's life the user base was small — the internal design team, occasional demos, scattered curious users. That was the right scope for R\u0026D. What changed was a content production team adopting the iOS app heavily for their daily work. 
They'd discovered the app could generate the kinds of assets their pipeline needed at a fraction of the time and cost of their existing tools.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"They became the first users who weren't trying the tool out of curiosity — they were using it because it solved real problems for them. They posted to the feed regularly. They requested features. They reported bugs. After a few months of intensive iOS use, they asked for desktop. That request was where the SwiftUI bet paid off: macOS support shipped quickly, and their workflow shifted to desktop for heavier sessions while staying on iOS for quick work.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Their needs drove the next round of model integration. They wanted character-controlled video — recording themselves moving and having a character mirror the motion. That meant integrating WAN Animate, which uses pose estimation and mask generation as conditioning. Out of the box, the masking and pose stages were slow enough to make the workflow painful. I spent real time on performance — TensorRT compilation for the mask and pose models, careful management of when they loaded, integration into the routing system so the right server stayed warm. The result was a workflow fast enough to actually use rather than fast enough to demo. I added WAN 2.2 workflows as the model became available, which lifted video quality across the board.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The shift in user base changed how I thought about the project. With the design team, the tool was creative R\u0026D — fun, but no one was depending on it. With the content team, it was infrastructure for daily work. Bugs mattered differently. Features mattered differently. 
The work shifted from \\\"what can this thing do\\\" to \\\"what does this team need next.\\\" That's the transition every internal tool either makes or doesn't.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"What it became, and what's next\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h2\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"By the time I moved on, the platform had reached a stable internal state. The iOS app and macOS port were in active use. The web version remained accessible but was no longer being updated. The character library, the timeline editor, the feed, the distributed workflow infrastructure — all of it was working as one coherent system.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Toward the end I shipped an agentic storyboarding feature: give the system a story prompt and a cast of characters, and a Gemini-based agent produces a structured JSON document describing the clips that compose the story — characters per clip, dialogue, action, shot type. The iOS app reads the JSON and executes it, populating the timeline. The structured JSON was the bridge: rather than asking the LLM to drive the system through a separate orchestration layer, the agent produces the same data the manual editing system already understood. A generated clip recipe and a hand-built one are the same data structure — just produced upstream. This was the Cursor-shaped pattern paying off architecturally: the manual interface and the agentic interface sharing primitives, with the agent as a different way of producing inputs to the same system rather than a separate system entirely.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"There was already LLM work threaded throughout the workflows — captioning, prompt assistance, structured generation steps — and the agentic storyboarding feature was one more application of that pattern at a higher level. 
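\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"The exact schema stayed internal, but a generated storyboard document had roughly this shape (shown as a Python literal; field names are illustrative):\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"# Rough shape of an agent-generated storyboard; field names are\\n# illustrative, not the exact production schema.\\nstoryboard = {\\n    'title': 'The Prospector and the Houseplant',\\n    'clips': [\\n        {\\n            'characters': ['prospector'],\\n            'dialogue': 'That fern has been watching me all morning.',\\n            'action': 'leans toward the camera, squinting',\\n            'shot_type': 'close_up',\\n        },\\n        {\\n            'characters': ['fern'],\\n            'dialogue': 'I see everything that happens in this kitchen.',\\n            'action': 'fronds rustle as it speaks',\\n            'shot_type': 'medium',\\n        },\\n    ],\\n}\\n# The iOS app executes this the same way whether it was hand-built in\\n# the editor or produced upstream by the Gemini-based agent.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"code\",\"version\":1,\"language\":\"python\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"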
It's a direction I'm continuing in current work outside Snap, where I'm building agent-driven systems that compose iOS apps using the same shared-primitives approach.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"A few lessons I'm carrying forward:\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Vision is patient.\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" The years of holding the deck while the tools didn't exist were the hardest part. Staying close enough to the open-source field to notice the moment WAN 2.1 and Fantasy Talk made the vision buildable was the work that made the rest of the work possible.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Pick the MVP.\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" The original deck described a full production crew of agents. The right move was narrowing to talking-character video with persistent characters and doing that well, then letting everything else follow. Ambition serves the work when it's pointed at one tractable thing first.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Shared primitives for human and AI use.\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" The same data model powered hand-edited timelines and agent-produced ones. The same workflows ran from manual UI and from LLM-driven calls. The same hooks served both. 
Designing for this from the start meant agentic features were natural extensions rather than parallel stacks.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Internal users are real users.\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" A team of professionals adopted the tool because it solved their problems better than the alternatives. Their feedback shaped the work; their needs drove the macOS port and the WAN Animate performance pass. The scale was modest by external standards. The dependency was real.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Related work:\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" This project was built on top of the ComfyUI platform described in \",\"type\":\"text\",\"version\":1},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"the [ComfyUI as Snap's Internal AI Platform case study]\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"internal-link\",\"version\":1,\"nodeId\":\"27:197\",\"cmsTarget\":null},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\", which served as the underlying execution substrate. 
\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":0,\"textStyle\":\"\"}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"root\",\"version\":1}}","itemId":"fe9cf38b-ce77-412f-9011-0ff99bd6c230","fieldSchemaId":"78fea6df-e8e8-4ba1-9f02-f908076b9ca4"},{"id":"00f64cf8-56bb-4b9a-ba58-a8c261ebaf02","value":"{\"root\":{\"children\":[{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"TL;DR\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"heading\",\"version\":1,\"tag\":\"h2\"},{\"children\":[{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Held a vision for AI-driven storytelling for three years until the open-source model layer caught up, then built it as a focused MVP: talking-character video with persistent characters across clips.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"listitem\",\"version\":1,\"value\":1},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Shipped a web prototype, then a SwiftUI iOS app (with near-free macOS support) that became the daily tool for an internal content team — three-section app structure: character library, multi-clip timeline editor, internal feed.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"listitem\",\"version\":1,\"value\":2},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Designed a distributed workflow execution system on top of ComfyUI: peer-pool architecture with Firebase coordination, specialized server pools that kept models warm across tiered hardware, and wrapped API nodes that became shared infrastructure for closed-source model exploration.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"listitem\",\"version\":1,\"value\":3},{\"children\":[{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Threaded a consistent architectural principle throughout: manual and AI-driven interfaces share the same primitives. 
The same data structures that powered hand-edited timelines were what agentic features produced upstream.\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"start\",\"indent\":0,\"type\":\"listitem\",\"version\":1,\"value\":4}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"list\",\"version\":1,\"listType\":\"bullet\",\"start\":1,\"tag\":\"ul\"}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"root\",\"version\":1}}","itemId":"fe9cf38b-ce77-412f-9011-0ff99bd6c230","fieldSchemaId":"d81dd6fc-e8ef-48fc-8efc-00ff09595db8"},{"id":"8345c409-31fe-4ba1-9c62-b94e9a98c146","value":"{\"root\":{\"children\":[{\"children\":[{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Role:\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" Design Engineer (originator and sole builder)\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Timeframe:\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" Late 2024 – April 2026 (approximately 18 months of active development, plus earlier conceptual work)\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"},{\"children\":[{\"detail\":0,\"format\":1,\"mode\":\"normal\",\"style\":\"\",\"text\":\"Tech:\",\"type\":\"text\",\"version\":1},{\"detail\":0,\"format\":0,\"mode\":\"normal\",\"style\":\"\",\"text\":\" Swift, SwiftUI, TypeScript, React, Firebase (Realtime Database + Firestore), ComfyUI (16 A100/L4/T4 GPUs across specialized pools), AVFoundation, custom routing system, WAN 2.1/2.2, Infinite Talk, Qwen Image / Image Edit, Florence VLM, Chatterbox / VibeVoice / Qwen TTS, Gemini API, TensorRT, Tuist, GCP\",\"type\":\"text\",\"version\":1}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"paragraph\",\"version\":1,\"textFormat\":1,\"textStyle\":\"\"}],\"direction\":\"ltr\",\"format\":\"\",\"indent\":0,\"type\":\"root\",\"version\":1}}","itemId":"fe9cf38b-ce77-412f-9011-0ff99bd6c230","fieldSchemaId":"c58740ec-dcc6-4a05-a207-bddab515c486"}]}}}},"slugByItemId":{"74a083a6-3f63-42e3-a8b9-1c88571e2ece":"heyday","4b3907c4-2d07-4e41-a9a3-1e96b8250ff8":"ai-dubbing-localization-project","3ec92fb9-c574-4a8d-a52b-8184dad50eca":"from-innovation-to-production-with-comfyui","6f4f787f-0d6a-482c-b321-d72b29759e55":"evb-5-gum-react","fe9cf38b-ce77-412f-9011-0ff99bd6c230":"building-a-storytelling-platform-for-talking-character-video","9dedf784-897e-4288-8414-7faad8ee2f7b":"an-agentic-ios-app-creation-system"}}