gavinzli committed on
Commit
9bc2641
·
1 Parent(s): 39fe3d1

chore: Update patterns.json with additional split configurations

Browse files
Files changed (1) hide show
  1. patterns.json +78 -12
patterns.json CHANGED
@@ -275,7 +275,13 @@
275
  "keyword": "相关研究",
276
  "article_regex": "《(.*?)》",
277
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
278
- "date_format": "%Y.%m.%d"
 
 
 
 
 
 
279
  },
280
  {
281
  "site": "Yongxing Securities Co., Ltd.",
@@ -293,16 +299,28 @@
293
  "keyword": "相关研究",
294
  "article_regex": "《(.*?)》",
295
  "date_regex": "(d{4}\\s/\\d{2}/\\d{2}) ",
296
- "date_format": "(%Y/%m/%d) "
 
 
 
 
 
 
297
  },
298
  {
299
  "site": "Hualong Securities Co., Ltd.",
300
  "pages": [0],
301
- "date_range": 1,
302
  "keyword": "相关阅读",
303
  "article_regex": "《(.*?)》",
304
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
305
- "date_format": "%Y.%m.%d"
 
 
 
 
 
 
306
  },
307
  {
308
  "site": "Hebei Yuanda Information Technology Co., Ltd.",
@@ -311,7 +329,13 @@
311
  "keyword": "相关报告:",
312
  "article_regex": "《(.*?)》",
313
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
314
- "date_format": "%Y.%m.%d"
 
 
 
 
 
 
315
  },
316
  {
317
  "site": "Huaxin Securities Co., Ltd.",
@@ -329,7 +353,13 @@
329
  "keyword": "1.",
330
  "article_regex": "《(.*?)》",
331
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
332
- "date_format": "%Y.%m.%d"
 
 
 
 
 
 
333
  },
334
  {
335
  "site": "Beijing Tengjing Big Data Application Technology Research Institute",
@@ -338,7 +368,13 @@
338
  "keyword": "相关报告",
339
  "article_regex": "《(.*?)》",
340
  "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
341
- "date_format": "%Y-%m-%d"
 
 
 
 
 
 
342
  },
343
  {
344
  "site": "Wanhe Securities Co., Ltd.",
@@ -347,7 +383,13 @@
347
  "keyword": "相关报告",
348
  "article_regex": "《(.*?)》",
349
  "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
350
- "date_format": "%Y-%m-%d"
 
 
 
 
 
 
351
  },
352
  {
353
  "site": "Centaline Securities Co., Ltd.",
@@ -356,7 +398,13 @@
356
  "keyword": "相关报告",
357
  "article_regex": "《(.*?)》",
358
  "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
359
- "date_format": "%Y-%m-%d"
 
 
 
 
 
 
360
  },
361
  {
362
  "site": "Tengjing Digital Research",
@@ -365,7 +413,13 @@
365
  "keyword": "相关报告",
366
  "article_regex": "《(.*?)》",
367
  "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
368
- "date_format": "%Y-%m-%d"
 
 
 
 
 
 
369
  },
370
  {
371
  "site": "Guoyuan Securities",
@@ -374,7 +428,13 @@
374
  "keyword": "相关研究报告",
375
  "article_regex": "《(.*?)》",
376
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
377
- "date_format": "%Y.%m.%d"
 
 
 
 
 
 
378
  },
379
  {
380
  "site": "China Galaxy Co., Ltd.",
@@ -392,7 +452,13 @@
392
  "keyword": "相关报告",
393
  "article_regex": "《(.*?)》",
394
  "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
395
- "date_format": "%Y-%m-%d"
 
 
 
 
 
 
396
  },
397
  {
398
  "site": "SDIC Anxin Futures",
 
275
  "keyword": "相关研究",
276
  "article_regex": "《(.*?)》",
277
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
278
+ "date_format": "%Y.%m.%d",
279
+ "split":[
280
+ {
281
+ "string": "——",
282
+ "index": 0
283
+ }
284
+ ]
285
  },
286
  {
287
  "site": "Yongxing Securities Co., Ltd.",
 
299
  "keyword": "相关研究",
300
  "article_regex": "《(.*?)》",
301
  "date_regex": "(d{4}\\s/\\d{2}/\\d{2}) ",
302
+ "date_format": "(%Y/%m/%d) ",
303
+ "split":[
304
+ {
305
+ "string": "——",
306
+ "index": 0
307
+ }
308
+ ]
309
  },
310
  {
311
  "site": "Hualong Securities Co., Ltd.",
312
  "pages": [0],
313
+ "date_range": 5,
314
  "keyword": "相关阅读",
315
  "article_regex": "《(.*?)》",
316
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
317
+ "date_format": "%Y.%m.%d",
318
+ "split":[
319
+ {
320
+ "string": "——",
321
+ "index": 0
322
+ }
323
+ ]
324
  },
325
  {
326
  "site": "Hebei Yuanda Information Technology Co., Ltd.",
 
329
  "keyword": "相关报告:",
330
  "article_regex": "《(.*?)》",
331
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
332
+ "date_format": "%Y.%m.%d",
333
+ "split":[
334
+ {
335
+ "string": ":",
336
+ "index": -1
337
+ }
338
+ ]
339
  },
340
  {
341
  "site": "Huaxin Securities Co., Ltd.",
 
353
  "keyword": "1.",
354
  "article_regex": "《(.*?)》",
355
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
356
+ "date_format": "%Y.%m.%d",
357
+ "split":[
358
+ {
359
+ "string": "——",
360
+ "index": 0
361
+ }
362
+ ]
363
  },
364
  {
365
  "site": "Beijing Tengjing Big Data Application Technology Research Institute",
 
368
  "keyword": "相关报告",
369
  "article_regex": "《(.*?)》",
370
  "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
371
+ "date_format": "%Y-%m-%d",
372
+ "split":[
373
+ {
374
+ "string": ":",
375
+ "index": -1
376
+ }
377
+ ]
378
  },
379
  {
380
  "site": "Wanhe Securities Co., Ltd.",
 
383
  "keyword": "相关报告",
384
  "article_regex": "《(.*?)》",
385
  "date_regex": "(\\d{4}-\\d{2}-\\d{2})",
386
+ "date_format": "%Y-%m-%d",
387
+ "split":[
388
+ {
389
+ "string": "-",
390
+ "index": -1
391
+ }
392
+ ]
393
  },
394
  {
395
  "site": "Centaline Securities Co., Ltd.",
 
398
  "keyword": "相关报告",
399
  "article_regex": "《(.*?)》",
400
  "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
401
+ "date_format": "%Y-%m-%d",
402
+ "split":[
403
+ {
404
+ "string": ":",
405
+ "index": -1
406
+ }
407
+ ]
408
  },
409
  {
410
  "site": "Tengjing Digital Research",
 
413
  "keyword": "相关报告",
414
  "article_regex": "《(.*?)》",
415
  "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
416
+ "date_format": "%Y-%m-%d",
417
+ "split":[
418
+ {
419
+ "string": ":",
420
+ "index": -1
421
+ }
422
+ ]
423
  },
424
  {
425
  "site": "Guoyuan Securities",
 
428
  "keyword": "相关研究报告",
429
  "article_regex": "《(.*?)》",
430
  "date_regex": "\\d{4}\\s?.\\s?\\d{1,2}\\s?.\\s?\\d{1,2}",
431
+ "date_format": "%Y.%m.%d",
432
+ "split":[
433
+ {
434
+ "string": ":",
435
+ "index": -1
436
+ }
437
+ ]
438
  },
439
  {
440
  "site": "China Galaxy Co., Ltd.",
 
452
  "keyword": "相关报告",
453
  "article_regex": "《(.*?)》",
454
  "date_regex": "(\\d{4}\\s?-\\s?\\d{2}\\s?-\\s?\\d{2})",
455
+ "date_format": "%Y-%m-%d",
456
+ "split":[
457
+ {
458
+ "string": ":",
459
+ "index": 0
460
+ }
461
+ ]
462
  },
463
  {
464
  "site": "SDIC Anxin Futures",