Skip to content

Commit 53641c3

Browse files
MarkShawn2020 and claude committed
feat(import): ChatGPT export → canonical JSONL importer
Phase 3.5.

lovcode-core::import::chatgpt:
- import_zip(path) and import_dir(path) read a ChatGPT data export (`Settings → Data controls → Export data`) — either the raw `.zip` or the unzipped directory containing `conversations.json`.
- The exporter's tree (parent-child `mapping` indexed by msg id + `current_node`) is linearized into a flat message list by walking parent links from the current node to the root.
- Each conversation is converted to canonical Conversation JSON and written to ~/.lovcode/imports/chatgpt/<basename>.jsonl, where the existing ImportedAdapter picks it up.

CLI:
- New `lovcode import <path> [--out DIR]` subcommand. Tells the user to run `lovcode index` afterwards.

End-to-end smoke: a synthetic 3-message conversations.json was imported, indexed, and searched ("Tokio echo" → ChatGPT hit with correct linear transcript and CJK title).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent d5d34a4 commit 53641c3

7 files changed

Lines changed: 268 additions & 1 deletion

File tree

Cargo.lock

Lines changed: 31 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
use anyhow::Result;
2+
use clap::Args as ClapArgs;
3+
use lovcode_core::import;
4+
use std::path::PathBuf;
5+
6+
// Arguments accepted by the `lovcode import` subcommand.
// NOTE: the `///` comments on the fields below are picked up by clap's derive
// as user-facing help text, so they are left exactly as written.
#[derive(ClapArgs)]
7+
pub struct Args {
8+
/// Path to a ChatGPT export `.zip` (or its unzipped directory containing
9+
/// `conversations.json`).
10+
// Input location; `run` checks whether it is a directory to pick the importer.
pub path: PathBuf,
11+
12+
/// Optional override for output directory.
13+
#[arg(long)]
14+
// Forwarded to the importer as its `out_dir` argument; `None` leaves the
// choice of destination to the importer.
pub out: Option<PathBuf>,
15+
}
16+
17+
pub fn run(args: Args) -> Result<()> {
18+
let p = &args.path;
19+
let out = if p.is_dir() {
20+
import::chatgpt::import_dir(p, args.out.as_deref())?
21+
} else {
22+
import::chatgpt::import_zip(p, args.out.as_deref())?
23+
};
24+
eprintln!("Imported → {}", out.display());
25+
eprintln!("Run `lovcode index` to make it searchable.");
26+
Ok(())
27+
}

crates/lovcode-cli/src/main.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//! `lovcode` — single binary, several subcommands.
22
3+
mod cmd_import;
34
mod cmd_index;
45
mod cmd_mcp;
56
mod cmd_search;
@@ -32,6 +33,9 @@ enum Cmd {
3233
/// Show one conversation by id.
3334
Show(cmd_show::Args),
3435

36+
/// Import a third-party export (currently: ChatGPT `.zip`).
37+
Import(cmd_import::Args),
38+
3539
/// List configured source adapters.
3640
Sources,
3741

@@ -60,6 +64,7 @@ fn main() -> anyhow::Result<()> {
6064
Cmd::Index(a) => cmd_index::run(&index_dir, a),
6165
Cmd::Search(a) => cmd_search::run(&index_dir, a),
6266
Cmd::Show(a) => cmd_show::run(&index_dir, a),
67+
Cmd::Import(a) => cmd_import::run(a),
6368
Cmd::Sources => cmd_sources::run(&index_dir),
6469
Cmd::Serve(a) => cmd_serve::run(&index_dir, a),
6570
Cmd::Mcp => cmd_mcp::run(&index_dir),

crates/lovcode-core/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ dirs = { workspace = true }
2121
rayon = { workspace = true }
2222
walkdir = { workspace = true }
2323
tracing = { workspace = true }
24+
zip = { version = "2", default-features = false, features = ["deflate"] }
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
//! Convert a ChatGPT `.zip` export (Settings → Data controls → Export data)
2+
//! into canonical Conversation JSON-lines under
3+
//! `~/.lovcode/imports/chatgpt/<basename>.jsonl`.
4+
5+
use crate::types::{Conversation, Message, Role};
6+
use anyhow::{anyhow, Context, Result};
7+
use chrono::{DateTime, TimeZone, Utc};
8+
use serde::Deserialize;
9+
use std::collections::HashMap;
10+
use std::fs::File;
11+
use std::io::{BufReader, Read, Write};
12+
use std::path::{Path, PathBuf};
13+
14+
/// One conversation entry as deserialized from `conversations.json`.
#[derive(Debug, Deserialize)]
15+
struct GptConv {
16+
/// Conversation id; `convert` fails with "missing id" when absent.
id: Option<String>,
17+
/// Conversation title, copied verbatim into the canonical record.
title: Option<String>,
18+
/// Creation time; passed through `epoch_to_dt`, which reads it as epoch seconds.
create_time: Option<f64>,
19+
/// Last-update time; handled the same way as `create_time`.
update_time: Option<f64>,
20+
/// Message tree keyed by node id; an absent field deserializes as an empty map.
#[serde(default)]
21+
mapping: HashMap<String, MapNode>,
22+
/// Node id that `linearize` starts from when walking parent links.
current_node: Option<String>,
23+
}
24+
25+
/// One node of a conversation's `mapping` tree.
#[derive(Debug, Deserialize)]
26+
struct MapNode {
27+
/// Message carried by this node, if any; nodes without one are skipped
/// when the path is turned into messages.
#[serde(default)]
28+
message: Option<GptMsg>,
29+
/// Id of the parent node; `None` ends the walk in `linearize`.
parent: Option<String>,
30+
/// Ids of child nodes; a node with no children counts as a leaf.
#[serde(default)]
31+
children: Vec<String>,
32+
/// Node id as stored inside the node itself; never read by this module,
/// hence the `dead_code` allowance.
#[serde(default)]
33+
#[allow(dead_code)]
34+
id: Option<String>,
35+
}
36+
37+
#[derive(Debug, Deserialize)]
38+
struct GptMsg {
39+
id: Option<String>,
40+
author: Option<Author>,
41+
create_time: Option<f64>,
42+
content: Option<MsgContent>,
43+
}
44+
45+
/// Author information attached to a message.
#[derive(Debug, Deserialize)]
46+
struct Author {
47+
/// Role name; `msg_to_message` recognises "user", "assistant", "system"
/// and "tool" and skips every other value.
role: Option<String>,
48+
}
49+
50+
/// Body of a message.
#[derive(Debug, Deserialize)]
51+
struct MsgContent {
52+
/// Content discriminator; `msg_to_message` only extracts text when this is
/// "text" or absent.
content_type: Option<String>,
53+
/// Content fragments; an absent field deserializes as an empty list. Only
/// fragments that are JSON strings are used by `msg_to_message`.
#[serde(default)]
54+
parts: Vec<serde_json::Value>,
55+
}
56+
57+
/// Import a ChatGPT export. Returns the path of the written JSONL.
58+
pub fn import_zip(zip_path: &Path, out_dir: Option<&Path>) -> Result<PathBuf> {
59+
let conversations = read_zip(zip_path)?;
60+
write_jsonl(&conversations, zip_path, out_dir)
61+
}
62+
63+
/// Import a directory containing a `conversations.json` (already unzipped).
64+
pub fn import_dir(dir: &Path, out_dir: Option<&Path>) -> Result<PathBuf> {
65+
let conv_json = dir.join("conversations.json");
66+
let raw = std::fs::read_to_string(&conv_json)
67+
.with_context(|| format!("read {}", conv_json.display()))?;
68+
let conversations = parse_conversations_json(&raw)?;
69+
write_jsonl(&conversations, dir, out_dir)
70+
}
71+
72+
fn read_zip(zip_path: &Path) -> Result<Vec<Conversation>> {
73+
let file = File::open(zip_path).with_context(|| format!("open {}", zip_path.display()))?;
74+
let mut archive = zip::ZipArchive::new(BufReader::new(file))
75+
.with_context(|| format!("read zip {}", zip_path.display()))?;
76+
let mut entry = archive
77+
.by_name("conversations.json")
78+
.map_err(|_| anyhow!("zip does not contain conversations.json"))?;
79+
let mut raw = String::new();
80+
entry.read_to_string(&mut raw)?;
81+
parse_conversations_json(&raw)
82+
}
83+
84+
fn parse_conversations_json(raw: &str) -> Result<Vec<Conversation>> {
85+
let gpts: Vec<GptConv> = serde_json::from_str(raw).context("parse conversations.json")?;
86+
Ok(gpts
87+
.into_iter()
88+
.filter_map(|c| convert(c).ok())
89+
.filter(|c| !c.messages.is_empty())
90+
.collect())
91+
}
92+
93+
fn convert(c: GptConv) -> Result<Conversation> {
94+
let id = c.id.clone().ok_or_else(|| anyhow!("missing id"))?;
95+
let messages = linearize(&c);
96+
97+
Ok(Conversation {
98+
id,
99+
source: "chatgpt".to_string(),
100+
project: None,
101+
title: c.title,
102+
created_at: c.create_time.and_then(epoch_to_dt),
103+
updated_at: c.update_time.and_then(epoch_to_dt),
104+
messages,
105+
raw_path: None,
106+
})
107+
}
108+
109+
/// Walk parent→child from current_node back to root, collecting messages.
110+
fn linearize(c: &GptConv) -> Vec<Message> {
111+
let Some(mut node_id) = c.current_node.clone().or_else(|| {
112+
// Fallback: pick any leaf (no children).
113+
c.mapping
114+
.iter()
115+
.find(|(_, n)| n.children.is_empty())
116+
.map(|(id, _)| id.clone())
117+
}) else {
118+
return Vec::new();
119+
};
120+
121+
let mut path: Vec<&MapNode> = Vec::new();
122+
let mut visited = std::collections::HashSet::new();
123+
loop {
124+
if !visited.insert(node_id.clone()) { break; }
125+
let Some(node) = c.mapping.get(&node_id) else { break };
126+
path.push(node);
127+
let Some(parent) = node.parent.clone() else { break };
128+
node_id = parent;
129+
}
130+
path.reverse();
131+
132+
path.iter()
133+
.filter_map(|n| n.message.as_ref().and_then(msg_to_message))
134+
.collect()
135+
}
136+
137+
fn msg_to_message(m: &GptMsg) -> Option<Message> {
138+
let role_str = m.author.as_ref().and_then(|a| a.role.as_deref()).unwrap_or("");
139+
let role = match role_str {
140+
"user" => Role::User,
141+
"assistant" => Role::Assistant,
142+
"system" => Role::System,
143+
"tool" => Role::Tool,
144+
_ => return None, // skip "function", empty, etc.
145+
};
146+
let content = m
147+
.content
148+
.as_ref()
149+
.map(|c| {
150+
if c.content_type.as_deref() == Some("text") || c.content_type.is_none() {
151+
c.parts
152+
.iter()
153+
.filter_map(|v| v.as_str().map(|s| s.to_string()))
154+
.collect::<Vec<_>>()
155+
.join("\n")
156+
} else {
157+
String::new()
158+
}
159+
})
160+
.unwrap_or_default();
161+
let content = content.trim().to_string();
162+
if content.is_empty() { return None; }
163+
164+
Some(Message {
165+
role,
166+
content,
167+
timestamp: m.create_time.and_then(epoch_to_dt),
168+
})
169+
}
170+
171+
fn epoch_to_dt(secs: f64) -> Option<DateTime<Utc>> {
172+
Utc.timestamp_opt(secs as i64, 0).single()
173+
}
174+
175+
fn write_jsonl(conversations: &[Conversation], source: &Path, out_dir: Option<&Path>) -> Result<PathBuf> {
176+
let dir = match out_dir {
177+
Some(d) => d.to_path_buf(),
178+
None => dirs::home_dir()
179+
.unwrap_or_else(|| PathBuf::from("."))
180+
.join(".lovcode")
181+
.join("imports")
182+
.join("chatgpt"),
183+
};
184+
std::fs::create_dir_all(&dir)?;
185+
let basename = source
186+
.file_stem()
187+
.and_then(|s| s.to_str())
188+
.unwrap_or("chatgpt-export");
189+
let out = dir.join(format!("{basename}.jsonl"));
190+
191+
let mut f = File::create(&out)?;
192+
for c in conversations {
193+
serde_json::to_writer(&mut f, c)?;
194+
writeln!(f)?;
195+
}
196+
tracing::info!(count = conversations.len(), path = %out.display(), "wrote chatgpt import");
197+
Ok(out)
198+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
//! Importers for third-party export formats. Converted output is written
2+
//! as canonical `Conversation` JSON-lines under
3+
//! `~/.lovcode/imports/<source>/`, ready for the matching `ImportedAdapter`.
4+
5+
pub mod chatgpt;

crates/lovcode-core/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@ pub mod index;
1515
pub mod query;
1616
pub mod watcher;
1717
pub mod detail;
18+
pub mod import;

0 commit comments

Comments
 (0)