nsarrazin HF Staff commited on
Commit
dd87ed5
·
unverified ·
1 Parent(s): 0a11e11

feat(migrations): clean up empty conversations from db (#1561)

Browse files

* feat(migrations): clean up empty conversations from db

* fix(tests): better test coverage, fix bug with sessions

* feat: better migration

* fix: stop test server after running tests

* fix: migration code

* fix: refactor to take into account large size of session collection

scripts/setupTest.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { vi } from "vitest";
2
  import dotenv from "dotenv";
3
  import { resolve } from "path";
4
  import fs from "fs";
@@ -41,3 +41,9 @@ vi.mock("$env/dynamic/private", async () => {
41
  },
42
  };
43
  });
 
 
 
 
 
 
 
1
+ import { vi, afterAll } from "vitest";
2
  import dotenv from "dotenv";
3
  import { resolve } from "path";
4
  import fs from "fs";
 
41
  },
42
  };
43
  });
44
+
45
+ afterAll(async () => {
46
+ if (mongoServer) {
47
+ await mongoServer.stop();
48
+ }
49
+ });
src/lib/migrations/routines/09-delete-empty-conversations.spec.ts ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { Session } from "$lib/types/Session";
2
+ import type { User } from "$lib/types/User";
3
+ import type { Conversation } from "$lib/types/Conversation";
4
+ import { ObjectId } from "mongodb";
5
+ import { deleteConversations } from "./09-delete-empty-conversations";
6
+ import { afterAll, afterEach, beforeAll, describe, expect, test } from "vitest";
7
+ import { collections } from "$lib/server/database";
8
+
9
+ type Message = Conversation["messages"][number];
10
+
11
+ const userData = {
12
+ _id: new ObjectId(),
13
+ createdAt: new Date(),
14
+ updatedAt: new Date(),
15
+ username: "new-username",
16
+ name: "name",
17
+ avatarUrl: "https://example.com/avatar.png",
18
+ hfUserId: "9999999999",
19
+ } satisfies User;
20
+ Object.freeze(userData);
21
+
22
+ const sessionForUser = {
23
+ _id: new ObjectId(),
24
+ createdAt: new Date(),
25
+ updatedAt: new Date(),
26
+ userId: userData._id,
27
+ sessionId: "session-id-9999999999",
28
+ expiresAt: new Date(Date.now() + 1000 * 60 * 60 * 24),
29
+ } satisfies Session;
30
+ Object.freeze(sessionForUser);
31
+
32
+ const userMessage = {
33
+ from: "user",
34
+ id: "user-message-id",
35
+ content: "Hello, how are you?",
36
+ } satisfies Message;
37
+
38
+ const assistantMessage = {
39
+ from: "assistant",
40
+ id: "assistant-message-id",
41
+ content: "I'm fine, thank you!",
42
+ } satisfies Message;
43
+
44
+ const systemMessage = {
45
+ from: "system",
46
+ id: "system-message-id",
47
+ content: "This is a system message",
48
+ } satisfies Message;
49
+
50
+ const conversationBase = {
51
+ _id: new ObjectId(),
52
+ createdAt: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000),
53
+ updatedAt: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000),
54
+ model: "model-id",
55
+ embeddingModel: "embedding-model-id",
56
+ title: "title",
57
+ messages: [],
58
+ } satisfies Conversation;
59
+
60
+ describe.sequential("Deleting discarded conversations", async () => {
61
+ test("a conversation with no messages should get deleted", async () => {
62
+ await collections.conversations.insertOne({
63
+ ...conversationBase,
64
+ sessionId: sessionForUser.sessionId,
65
+ });
66
+
67
+ const result = await deleteConversations(collections);
68
+
69
+ expect(result).toBe(1);
70
+ });
71
+ test("a conversation with no messages that is less than 1 hour old should not get deleted", async () => {
72
+ await collections.conversations.insertOne({
73
+ ...conversationBase,
74
+ sessionId: sessionForUser.sessionId,
75
+ createdAt: new Date(Date.now() - 30 * 60 * 1000),
76
+ });
77
+
78
+ const result = await deleteConversations(collections);
79
+
80
+ expect(result).toBe(0);
81
+ });
82
+ test("a conversation with only system messages should get deleted", async () => {
83
+ await collections.conversations.insertOne({
84
+ ...conversationBase,
85
+ sessionId: sessionForUser.sessionId,
86
+ messages: [systemMessage],
87
+ });
88
+
89
+ const result = await deleteConversations(collections);
90
+
91
+ expect(result).toBe(1);
92
+ });
93
+ test("a conversation with a user message should not get deleted", async () => {
94
+ await collections.conversations.insertOne({
95
+ ...conversationBase,
96
+ sessionId: sessionForUser.sessionId,
97
+ messages: [userMessage],
98
+ });
99
+
100
+ const result = await deleteConversations(collections);
101
+
102
+ expect(result).toBe(0);
103
+ });
104
+ test("a conversation with an assistant message should not get deleted", async () => {
105
+ await collections.conversations.insertOne({
106
+ ...conversationBase,
107
+ sessionId: sessionForUser.sessionId,
108
+ messages: [assistantMessage],
109
+ });
110
+
111
+ const result = await deleteConversations(collections);
112
+
113
+ expect(result).toBe(0);
114
+ });
115
+ test("a conversation with a mix of messages should not get deleted", async () => {
116
+ await collections.conversations.insertOne({
117
+ ...conversationBase,
118
+ sessionId: sessionForUser.sessionId,
119
+ messages: [systemMessage, userMessage, assistantMessage, userMessage, assistantMessage],
120
+ });
121
+
122
+ const result = await deleteConversations(collections);
123
+
124
+ expect(result).toBe(0);
125
+ });
126
+ test("a conversation with a userId and no sessionId should not get deleted", async () => {
127
+ await collections.conversations.insertOne({
128
+ ...conversationBase,
129
+ messages: [userMessage, assistantMessage],
130
+ userId: userData._id,
131
+ });
132
+
133
+ const result = await deleteConversations(collections);
134
+
135
+ expect(result).toBe(0);
136
+ });
137
+ test("a conversation with no userId or sessionId should get deleted", async () => {
138
+ await collections.conversations.insertOne({
139
+ ...conversationBase,
140
+ messages: [userMessage, assistantMessage],
141
+ });
142
+
143
+ const result = await deleteConversations(collections);
144
+
145
+ expect(result).toBe(1);
146
+ });
147
+ test("a conversation with a sessionId that exists should not get deleted", async () => {
148
+ await collections.conversations.insertOne({
149
+ ...conversationBase,
150
+ messages: [userMessage, assistantMessage],
151
+ sessionId: sessionForUser.sessionId,
152
+ });
153
+
154
+ const result = await deleteConversations(collections);
155
+
156
+ expect(result).toBe(0);
157
+ });
158
+ test("a conversation with a userId and a sessionId that doesn't exist should NOT get deleted", async () => {
159
+ await collections.conversations.insertOne({
160
+ ...conversationBase,
161
+ userId: userData._id,
162
+ messages: [userMessage, assistantMessage],
163
+ sessionId: new ObjectId().toString(),
164
+ });
165
+
166
+ const result = await deleteConversations(collections);
167
+
168
+ expect(result).toBe(0);
169
+ });
170
+ test("a conversation with only a sessionId that doesn't exist, should get deleted", async () => {
171
+ await collections.conversations.insertOne({
172
+ ...conversationBase,
173
+ messages: [userMessage, assistantMessage],
174
+ sessionId: new ObjectId().toString(),
175
+ });
176
+
177
+ const result = await deleteConversations(collections);
178
+
179
+ expect(result).toBe(1);
180
+ });
181
+ test("many conversations should get deleted", async () => {
182
+ const conversations = Array.from({ length: 10010 }, () => ({
183
+ ...conversationBase,
184
+ _id: new ObjectId(),
185
+ }));
186
+
187
+ await collections.conversations.insertMany(conversations);
188
+
189
+ const result = await deleteConversations(collections);
190
+
191
+ expect(result).toBe(10010);
192
+ });
193
+ });
194
+
195
+ beforeAll(async () => {
196
+ await collections.users.insertOne(userData);
197
+ await collections.sessions.insertOne(sessionForUser);
198
+ });
199
+
200
+ afterAll(async () => {
201
+ await collections.users.deleteOne({
202
+ _id: userData._id,
203
+ });
204
+ await collections.sessions.deleteOne({
205
+ _id: sessionForUser._id,
206
+ });
207
+ await collections.conversations.deleteMany({});
208
+ });
209
+
210
+ afterEach(async () => {
211
+ await collections.conversations.deleteMany({
212
+ _id: { $in: [conversationBase._id] },
213
+ });
214
+ });
src/lib/migrations/routines/09-delete-empty-conversations.ts ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { Migration } from ".";
2
+ import { collections } from "$lib/server/database";
3
+ import { Collection, FindCursor, ObjectId } from "mongodb";
4
+ import { logger } from "$lib/server/logger";
5
+ import type { Conversation } from "$lib/types/Conversation";
6
+
7
+ const BATCH_SIZE = 1000;
8
+ const DELETE_THRESHOLD_MS = 60 * 60 * 1000;
9
+
10
+ async function deleteBatch(conversations: Collection<Conversation>, ids: ObjectId[]) {
11
+ if (ids.length === 0) return 0;
12
+ const deleteResult = await conversations.deleteMany({ _id: { $in: ids } });
13
+ return deleteResult.deletedCount;
14
+ }
15
+
16
+ async function processCursor<T>(
17
+ cursor: FindCursor<T>,
18
+ processBatchFn: (batch: T[]) => Promise<void>
19
+ ) {
20
+ let batch = [];
21
+ while (await cursor.hasNext()) {
22
+ const doc = await cursor.next();
23
+ if (doc) {
24
+ batch.push(doc);
25
+ }
26
+ if (batch.length >= BATCH_SIZE) {
27
+ await processBatchFn(batch);
28
+ batch = [];
29
+ }
30
+ }
31
+ if (batch.length > 0) {
32
+ await processBatchFn(batch);
33
+ }
34
+ }
35
+
36
+ export async function deleteConversations(
37
+ collections: typeof import("$lib/server/database").collections
38
+ ) {
39
+ let deleteCount = 0;
40
+ const { conversations, sessions } = collections;
41
+
42
+ // First criteria: Delete conversations with no user/assistant messages older than 1 hour
43
+ const emptyConvCursor = conversations
44
+ .find({
45
+ "messages.from": { $not: { $in: ["user", "assistant"] } },
46
+ createdAt: { $lt: new Date(Date.now() - DELETE_THRESHOLD_MS) },
47
+ })
48
+ .batchSize(BATCH_SIZE);
49
+
50
+ await processCursor(emptyConvCursor, async (batch) => {
51
+ const ids = batch.map((doc) => doc._id);
52
+ deleteCount += await deleteBatch(conversations, ids);
53
+ });
54
+
55
+ // Second criteria: Process conversations without users in batches and check sessions
56
+ const noUserCursor = conversations.find({ userId: { $exists: false } }).batchSize(BATCH_SIZE);
57
+
58
+ await processCursor(noUserCursor, async (batch) => {
59
+ const sessionIds = [
60
+ ...new Set(batch.map((conv) => conv.sessionId).filter((id): id is string => !!id)),
61
+ ];
62
+
63
+ const existingSessions = await sessions.find({ sessionId: { $in: sessionIds } }).toArray();
64
+ const validSessionIds = new Set(existingSessions.map((s) => s.sessionId));
65
+
66
+ const invalidConvs = batch.filter(
67
+ (conv) => !conv.sessionId || !validSessionIds.has(conv.sessionId)
68
+ );
69
+ const idsToDelete = invalidConvs.map((conv) => conv._id);
70
+ deleteCount += await deleteBatch(conversations, idsToDelete);
71
+ });
72
+
73
+ logger.info(`[MIGRATIONS] Deleted ${deleteCount} conversations in total.`);
74
+ return deleteCount;
75
+ }
76
+
77
+ const deleteEmptyConversations: Migration = {
78
+ _id: new ObjectId("000000000000000000000009"),
79
+ name: "Delete conversations with no user or assistant messages or valid sessions",
80
+ up: async () => {
81
+ await deleteConversations(collections);
82
+ return true;
83
+ },
84
+ runEveryTime: true,
85
+ runForHuggingChat: "only",
86
+ };
87
+
88
+ export default deleteEmptyConversations;
src/lib/migrations/routines/index.ts CHANGED
@@ -9,7 +9,7 @@ import updateMessageFiles from "./05-update-message-files";
9
  import trimMessageUpdates from "./06-trim-message-updates";
10
  import resetTools from "./07-reset-tools-in-settings";
11
  import updateFeaturedToReview from "./08-update-featured-to-review";
12
-
13
  export interface Migration {
14
  _id: ObjectId;
15
  name: string;
@@ -29,4 +29,5 @@ export const migrations: Migration[] = [
29
  trimMessageUpdates,
30
  resetTools,
31
  updateFeaturedToReview,
 
32
  ];
 
9
  import trimMessageUpdates from "./06-trim-message-updates";
10
  import resetTools from "./07-reset-tools-in-settings";
11
  import updateFeaturedToReview from "./08-update-featured-to-review";
12
+ import deleteEmptyConversations from "./09-delete-empty-conversations";
13
  export interface Migration {
14
  _id: ObjectId;
15
  name: string;
 
29
  trimMessageUpdates,
30
  resetTools,
31
  updateFeaturedToReview,
32
+ deleteEmptyConversations,
33
  ];
src/lib/server/database.ts CHANGED
@@ -228,6 +228,20 @@ export class Database {
228
  tools.createIndex({ createdById: 1, userCount: -1 }).catch((e) => logger.error(e));
229
  tools.createIndex({ userCount: 1 }).catch((e) => logger.error(e));
230
  tools.createIndex({ last24HoursCount: 1 }).catch((e) => logger.error(e));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  }
232
  }
233
 
 
228
  tools.createIndex({ createdById: 1, userCount: -1 }).catch((e) => logger.error(e));
229
  tools.createIndex({ userCount: 1 }).catch((e) => logger.error(e));
230
  tools.createIndex({ last24HoursCount: 1 }).catch((e) => logger.error(e));
231
+
232
+ conversations
233
+ .createIndex({
234
+ "messages.from": 1,
235
+ createdAt: 1,
236
+ })
237
+ .catch((e) => logger.error(e));
238
+
239
+ conversations
240
+ .createIndex({
241
+ userId: 1,
242
+ sessionId: 1,
243
+ })
244
+ .catch((e) => logger.error(e));
245
  }
246
  }
247
 
vite.config.ts CHANGED
@@ -41,5 +41,6 @@ export default defineConfig({
41
  setupFiles: ["./scripts/setupTest.ts"],
42
  deps: { inline: ["@sveltejs/kit"] },
43
  globals: true,
 
44
  },
45
  });
 
41
  setupFiles: ["./scripts/setupTest.ts"],
42
  deps: { inline: ["@sveltejs/kit"] },
43
  globals: true,
44
+ testTimeout: 10000,
45
  },
46
  });