yinuozhang committed on
Commit
bcae4e2
·
1 Parent(s): 22efa51
Files changed (1) hide show
  1. app.py +68 -200
app.py CHANGED
@@ -1,13 +1,11 @@
1
  import os
2
  import gradio_client.utils as client_utils
3
-
4
  _original = client_utils._json_schema_to_python_type
5
  def _safe_json_schema_to_python_type(schema, defs=None):
6
  if isinstance(schema, bool):
7
  return "Any"
8
  return _original(schema, defs)
9
-
10
- # Override both entry points
11
  client_utils._json_schema_to_python_type = _safe_json_schema_to_python_type
12
  client_utils.json_schema_to_python_type = _safe_json_schema_to_python_type
13
  import gradio as gr
@@ -37,19 +35,18 @@ class PeptideAnalyzer:
37
  (r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
38
  ]
39
  self.complex_residue_patterns = [
40
- # Kpg - Lys(palmitoyl-Glu-OtBu) - Exact pattern for the specific structure
41
  (r'\[C[@]H\]\(CCCNC\(=O\)CCC\[C@@H\]\(NC\(=O\)CCCCCCCCCCCCCCCC\)C\(=O\)OC\(C\)\(C\)C\)', 'Kpg'),
42
  (r'CCCCCCCCCCCCCCCCC\(=O\)N\[C@H\]\(CCCC\(=O\)NCCC\[C@@H\]', 'Kpg'),
43
  (r'\[C@*H\]\(CSC\(c\d+ccccc\d+\)\(c\d+ccccc\d+\)c\d+ccc\(OC\)cc\d+\)', 'Cmt'),
44
- (r'CSC\(c.*?c.*?OC\)', 'Cmt'), # Core structure of Cys-Mmt group
45
- (r'COc.*?ccc\(C\(SC', 'Cmt'), # Start of Cmt in cyclic peptides
46
- (r'c2ccccc2\)c2ccccc2\)cc', 'Cmt'), # End of Cmt in cyclic peptides
47
- # Glu(OAll) - Only match the complete pattern to avoid partial matches
48
  (r'C=CCOC\(=O\)CC\[C@@H\]', 'Eal'),
49
  (r'\(C\)OP\(=O\)\(O\)OCc\d+ccccc\d+', 'Tpb'),
50
  #(r'COc\d+ccc\(C\(SC\[C@@H\]\d+.*?\)\(c\d+ccccc\d+\)c\d+ccccc\d+\)cc\d+', 'Cmt-cyclic'),
51
 
52
- # Dtg - Asp(OtBu)-(Dmb)Gly - Full pattern
53
  (r'CN\(Cc\d+ccc\(OC\)cc\d+OC\)C\(=O\)\[C@H\]\(CC\(=O\)OC\(C\)\(C\)C\)', 'Dtg'),
54
  (r'C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
55
  (r'N\[C@@H\]\(CC\(=O\)OC\(C\)\(C\)C\)C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
@@ -71,13 +68,10 @@ class PeptideAnalyzer:
71
  }
72
  def preprocess_complex_residues(self, smiles):
73
  """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
74
- # Create a mapping of positions to complex residue types
75
  complex_positions = []
76
 
77
- # Search for all complex residue patterns
78
  for pattern, residue_type in self.complex_residue_patterns:
79
  for match in re.finditer(pattern, smiles):
80
- # Only add if this position doesn't overlap with existing matches
81
  if not any(pos['start'] <= match.start() < pos['end'] or
82
  pos['start'] < match.end() <= pos['end'] for pos in complex_positions):
83
  complex_positions.append({
@@ -87,56 +81,44 @@ class PeptideAnalyzer:
87
  'pattern': match.group()
88
  })
89
 
90
- # Sort by position (to handle potential overlapping matches)
91
  complex_positions.sort(key=lambda x: x['start'])
92
 
93
- # If no complex residues found, return original SMILES
94
  if not complex_positions:
95
  return smiles, []
96
 
97
- # Build a new SMILES string, protecting complex residues
98
  preprocessed_smiles = smiles
99
- offset = 0 # Track offset from replacements
100
 
101
  protected_residues = []
102
 
103
  for pos in complex_positions:
104
- # Adjust positions based on previous replacements
105
  start = pos['start'] + offset
106
  end = pos['end'] + offset
107
 
108
- # Extract the complex residue part
109
  complex_part = preprocessed_smiles[start:end]
110
 
111
- # Verify this is a complete residue (should have proper amino acid structure)
112
  if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
113
- continue # Skip if not a proper amino acid structure
114
 
115
- # Create a placeholder for this complex residue
116
  placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
117
 
118
- # Replace the complex part with the placeholder
119
  preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
120
 
121
- # Track the offset change
122
  offset += len(placeholder) - (end - start)
123
 
124
- # Store the residue information
125
  protected_residues.append({
126
  'placeholder': placeholder,
127
  'type': pos['type'],
128
  'content': complex_part
129
  })
130
-
131
- #print(f"Protected {pos['type']}: {complex_part[:20]}... as {placeholder}")
132
-
133
  return preprocessed_smiles, protected_residues
134
  def split_on_bonds(self, smiles, protected_residues=None):
135
  """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
136
  positions = []
137
  used = set()
138
 
139
- # First, handle protected complex residues if any
140
  if protected_residues:
141
  for residue in protected_residues:
142
  match = re.search(residue['placeholder'], smiles)
@@ -166,7 +148,6 @@ class PeptideAnalyzer:
166
  })
167
  used.update(range(match.start(), match.end()))
168
 
169
- # Then find all other bonds
170
  for pattern, bond_type in self.bond_patterns:
171
  for match in re.finditer(pattern, smiles):
172
  if not any(p in range(match.start(), match.end()) for p in used):
@@ -178,17 +159,13 @@ class PeptideAnalyzer:
178
  })
179
  used.update(range(match.start(), match.end()))
180
 
181
- # Sort all positions
182
  bond_positions.sort(key=lambda x: x['start'])
183
 
184
- # Combine complex residue positions and bond positions
185
  all_positions = positions + bond_positions
186
  all_positions.sort(key=lambda x: x['start'])
187
 
188
- # Create segments
189
  segments = []
190
 
191
- # First segment (if not starting with a bond or complex residue)
192
  if all_positions and all_positions[0]['start'] > 0:
193
  segments.append({
194
  'content': smiles[0:all_positions[0]['start']],
@@ -196,12 +173,10 @@ class PeptideAnalyzer:
196
  'complex_after': all_positions[0]['pattern'] if all_positions[0]['type'] == 'complex' else None
197
  })
198
 
199
- # Process segments between positions
200
  for i in range(len(all_positions)-1):
201
  current = all_positions[i]
202
  next_pos = all_positions[i+1]
203
 
204
- # Handle complex residues
205
  if current['type'] == 'complex':
206
  segments.append({
207
  'content': current['content'],
@@ -209,7 +184,6 @@ class PeptideAnalyzer:
209
  'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None,
210
  'complex_type': current['residue_type']
211
  })
212
- # Handle regular bonds
213
  elif current['type'] == 'gly':
214
  segments.append({
215
  'content': 'NCC(=O)',
@@ -217,7 +191,6 @@ class PeptideAnalyzer:
217
  'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
218
  })
219
  else:
220
- # Only create segment if there's content between this bond and next position
221
  content = smiles[current['end']:next_pos['start']]
222
  if content and next_pos['type'] != 'complex':
223
  segments.append({
@@ -268,14 +241,13 @@ class PeptideAnalyzer:
268
  # Find all numbers used in ring closures
269
  ring_numbers = re.findall(r'(?:^|[^c])[0-9](?=[A-Z@\(\)])', smiles)
270
 
271
- # Find aromatic ring numbers
272
  aromatic_matches = re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles)
273
  aromatic_cycles = []
274
  for match in aromatic_matches:
275
  numbers = re.findall(r'[0-9]', match)
276
  aromatic_cycles.extend(numbers)
277
 
278
- # Numbers that aren't part of aromatic rings are peptide cycles
279
  peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]
280
 
281
  is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
@@ -309,17 +281,15 @@ class PeptideAnalyzer:
309
  print("DIRECT MATCH: Found Cmt at beginning")
310
  return 'Cmt', mods
311
 
312
- # VERY EXPLICIT check for the last segment in your example
313
  if '[C@@H]3CCCN3C2=O)(c2ccccc2)c2ccccc2)cc' in content:
314
  print("DIRECT MATCH: Found Pro at end")
315
  return 'Pro', mods
316
- # === Original amino acid patterns ===
317
  # Eal - Glu(OAll) - Multiple patterns
318
  if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
319
  return 'Eal', mods
320
- # Proline (P) - flexible ring numbers
321
  if any([
322
- # Check for any ring number in bond patterns
323
  (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
324
  any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
325
  for n in '123456789'
@@ -327,12 +297,11 @@ class PeptideAnalyzer:
327
  any(f'CCC{n}' for n in '123456789'))
328
  for n in '123456789'
329
  ]) or any([
330
- # Check ending patterns with any ring number
331
  (f'CCCN{n}' in content and content.endswith('=O') and
332
  any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
333
  for n in '123456789'
334
  ]) or any([
335
- # Handle CCC[C@H]n patterns
336
  (content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
337
  (content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
338
  # N-terminal Pro with any ring number
@@ -349,35 +318,29 @@ class PeptideAnalyzer:
349
  # Tryptophan (W) - more specific indole pattern
350
  if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
351
  'c[nH]c' in content.replace(' ', ''):
352
- # Check stereochemistry for D/L
353
  if '[C@H](CC' in content: # D-form
354
  return 'trp', mods
355
  return 'Trp', mods
356
 
357
  # Lysine (K) - both patterns
358
  if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
359
- # Check stereochemistry for D/L
360
  if '[C@H](CCCCN)' in content: # D-form
361
  return 'lys', mods
362
  return 'Lys', mods
363
 
364
  # Arginine (R) - both patterns
365
  if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
366
- # Check stereochemistry for D/L
367
  if '[C@H](CCCNC(=N)N)' in content: # D-form
368
  return 'arg', mods
369
  return 'Arg', mods
370
 
371
- # Regular residue identification
372
  if content == 'C' and segment.get('bond_before') and segment.get('bond_after'):
373
- # If it's surrounded by peptide bonds, it's almost certainly Gly
374
  if ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
375
  ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
376
  return 'Gly', mods
377
 
378
- # Case 2: Cyclic terminal glycine - typically contains 'CNC' with ring closure
379
  if 'CNC' in content and any(f'C{i}=' in content for i in range(1, 10)):
380
- return 'Gly', mods # This will catch patterns like 'CNC1=O'
381
  if not segment.get('bond_before') and segment.get('bond_after'):
382
  if content == 'C' or content == 'NC':
383
  if ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
@@ -385,14 +348,12 @@ class PeptideAnalyzer:
385
 
386
  # Leucine patterns (L/l)
387
  if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
388
- # Check stereochemistry for D/L
389
  if '[C@H](CC(C)C)' in content or 'CC(C)C[C@H]' in content: # D-form
390
  return 'leu', mods
391
  return 'Leu', mods
392
 
393
  # Threonine patterns (T/t)
394
  if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H]([C@H](C)O)' in content or '[C@H]([C@@H](C)O)' in content:
395
- # Check both stereochemistry patterns
396
  if '[C@H]([C@@H](C)O)' in content: # D-form
397
  return 'thr', mods
398
  return 'Thr', mods
@@ -402,7 +363,6 @@ class PeptideAnalyzer:
402
 
403
  # Phenylalanine patterns (F/f)
404
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
405
- # Check stereochemistry for D/L
406
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content): # D-form
407
  return 'phe', mods
408
  return 'Phe', mods
@@ -411,15 +371,12 @@ class PeptideAnalyzer:
411
  '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
412
  'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
413
 
414
- # Make sure it's not leucine
415
  if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
416
- # Check stereochemistry
417
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
418
  return 'val', mods
419
  return 'Val', mods
420
 
421
  # Isoleucine patterns (I/i)
422
- # First check for various isoleucine patterns while excluding valine
423
  if (any(['CC[C@@H](C)' in content, '[C@@H](C)CC' in content, '[C@@H](CC)C' in content,
424
  'C(C)C[C@@H]' in content, '[C@@H]([C@H](C)CC)' in content, '[C@H]([C@@H](C)CC)' in content,
425
  '[C@@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
@@ -429,30 +386,26 @@ class PeptideAnalyzer:
429
  'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
430
  and 'CC(C)C' not in content): # Exclude valine pattern
431
 
432
- # Check stereochemistry for D/L forms
433
  if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
434
  '[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
435
  'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
436
  'CC[C@@H](C)[C@H]' in content, 'CC[C@H](C)[C@H]' in content]):
437
  # D-form
438
  return 'ile', mods
439
- # All other stereochemistries are treated as L-form
440
  return 'Ile', mods
441
- # Tpb - Thr(PO(OBzl)OH) - Multiple patterns
442
  if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
443
  return 'Tpb', mods
444
 
445
  # Alanine patterns (A/a)
446
  if ('[C@H](C)' in content or '[C@@H](C)' in content):
447
  if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
448
- # Check stereochemistry for D/L
449
  if '[C@H](C)' in content: # D-form
450
  return 'ala', mods
451
  return 'Ala', mods
452
 
453
  # Tyrosine patterns (Y/y)
454
  if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
455
- # Check stereochemistry for D/L
456
  if '[C@H](Cc1ccc(O)cc1)' in content: # D-form
457
  return 'tyr', mods
458
  return 'Tyr', mods
@@ -460,25 +413,24 @@ class PeptideAnalyzer:
460
  # Serine patterns (S/s)
461
  if '[C@H](CO)' in content or '[C@@H](CO)' in content:
462
  if not ('C(C)O' in content or 'COC' in content):
463
- # Check stereochemistry for D/L
464
  if '[C@H](CO)' in content: # D-form
465
  return 'ser', mods
466
  return 'Ser', mods
467
 
468
  if 'CSSC' in content:
469
- # Check for various cysteine-cysteine bridge patterns
470
  if re.search(r'\[C@@H\].*CSSC.*\[C@@H\]', content) or re.search(r'\[C@H\].*CSSC.*\[C@H\]', content):
471
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
472
  return 'cys-cys', mods
473
  return 'Cys-Cys', mods
474
 
475
- # Pattern for cysteine with N-terminal amine group
476
  if '[C@@H](N)CSSC' in content or '[C@H](N)CSSC' in content:
477
  if '[C@H](N)CSSC' in content: # D-form
478
  return 'cys-cys', mods
479
  return 'Cys-Cys', mods
480
 
481
- # Pattern for cysteine with C-terminal carboxyl
482
  if 'CSSC[C@@H](C(=O)O)' in content or 'CSSC[C@H](C(=O)O)' in content:
483
  if 'CSSC[C@H](C(=O)O)' in content: # D-form
484
  return 'cys-cys', mods
@@ -486,14 +438,12 @@ class PeptideAnalyzer:
486
 
487
  # Cysteine patterns (C/c)
488
  if '[C@H](CS)' in content or '[C@@H](CS)' in content:
489
- # Check stereochemistry for D/L
490
  if '[C@H](CS)' in content: # D-form
491
  return 'cys', mods
492
  return 'Cys', mods
493
 
494
  # Methionine patterns (M/m)
495
  if ('CCSC' in content) or ("CSCC" in content):
496
- # Check stereochemistry for D/L
497
  if '[C@H](CCSC)' in content: # D-form
498
  return 'met', mods
499
  elif '[C@H]' in content:
@@ -502,34 +452,29 @@ class PeptideAnalyzer:
502
 
503
  # Glutamine patterns (Q/q)
504
  if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
505
- # Check stereochemistry for D/L
506
  if '[C@H](CCC(=O)N)' in content: # D-form
507
  return 'gln', mods
508
  return 'Gln', mods
509
 
510
  # Asparagine patterns (N/n)
511
  if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
512
- # Check stereochemistry for D/L
513
  if '[C@H](CC(=O)N)' in content: # D-form
514
  return 'asn', mods
515
  return 'Asn', mods
516
 
517
  # Glutamic acid patterns (E/e)
518
  if ('CCC(=O)O' in content):
519
- # Check stereochemistry for D/L
520
  if '[C@H](CCC(=O)O)' in content: # D-form
521
  return 'glu', mods
522
  return 'Glu', mods
523
 
524
  # Aspartic acid patterns (D/d)
525
  if ('CC(=O)O' in content):
526
- # Check stereochemistry for D/L
527
  if '[C@H](CC(=O)O)' in content: # D-form
528
  return 'asp', mods
529
  return 'Asp', mods
530
 
531
  if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
532
- # Check stereochemistry for D/L
533
  if '[C@H]' in content: # D-form
534
  return 'his', mods
535
  return 'His', mods
@@ -539,29 +484,26 @@ class PeptideAnalyzer:
539
  if ('N[C@@H](CCCC)' in content or '[C@@H](CCCC)' in content or 'CCCC[C@@H]' in content or
540
  'N[C@H](CCCC)' in content or '[C@H](CCCC)' in content) and 'CC(C)' not in content:
541
  return 'Nle', mods
542
- # Aib - alpha-aminoisobutyric acid (2-aminoisobutyric acid)
543
- # More flexible pattern detection
544
  if 'C(C)(C)(N)' in content:
545
  return 'Aib', mods
546
 
547
- # Partial Aib pattern but NOT part of t-butyl ester
548
  if 'C(C)(C)' in content and 'OC(C)(C)C' not in content:
549
  if (segment.get('bond_before') and segment.get('bond_after') and
550
  any(bond in segment['bond_before'] for bond in ['C(=O)N', 'NC(=O)', 'N(C)C(=O)']) and
551
  any(bond in segment['bond_after'] for bond in ['NC(=O)', 'C(=O)N', 'N(C)C(=O)'])):
552
  return 'Aib', mods
553
 
554
- # Dtg - Asp(OtBu)-(Dmb)Gly - Simplified pattern for better detection
555
  if 'CC(=O)OC(C)(C)C' in content and 'CC1=C(C=C(C=C1)OC)OC' in content:
556
  return 'Dtg', mods
557
 
558
 
559
- # Kpg - Lys(palmitoyl-Glu-OtBu) - Simplified pattern
560
  if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
561
  return 'Kpg', mods
562
 
563
 
564
-
565
  return None, mods
566
 
567
  def get_modifications(self, segment):
@@ -582,67 +524,45 @@ class PeptideAnalyzer:
582
 
583
  return mods
584
 
585
- def analyze_structure(self, smiles):
586
- """Main analysis function with preprocessing for complex residues"""
587
- #print("\nAnalyzing structure:", smiles)
588
-
589
- # Pre-process to identify complex residues first
590
  preprocessed_smiles, protected_residues = self.preprocess_complex_residues(smiles)
591
- """
592
- if protected_residues:
593
- print(f"Identified {len(protected_residues)} complex residues during pre-processing")
594
- for i, residue in enumerate(protected_residues):
595
- print(f"Complex residue {i+1}: {residue['type']}")
596
- """
597
-
598
- # Check if it's cyclic
599
  is_cyclic, peptide_cycles, aromatic_cycles = self.is_cyclic(smiles)
600
 
601
- # Split into segments, respecting protected residues
602
  segments = self.split_on_bonds(preprocessed_smiles, protected_residues)
603
 
604
- #print("\nSegment Analysis:")
605
  sequence = []
606
  for i, segment in enumerate(segments):
607
- """
608
- print(f"\nSegment {i}:")
609
- print(f"Content: {segment.get('content', 'None')}")
610
- print(f"Bond before: {segment.get('bond_before', 'None')}")
611
- print(f"Bond after: {segment.get('bond_after', 'None')}")
612
- """
613
  residue, mods = self.identify_residue(segment)
614
  if residue:
615
  if mods:
616
  sequence.append(f"{residue}({','.join(mods)})")
617
  else:
618
  sequence.append(residue)
619
-
620
- #print(f"Identified as: {residue}")
621
- #print(f"Modifications: {mods}")
622
  else:
623
- print(f"Warning: Could not identify residue in segment: {segment.get('content', 'None')}")
624
 
625
- # Format the sequence
626
  three_letter = '-'.join(sequence)
627
 
628
- # Use the mapping to create one-letter code
629
  one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)
630
 
631
  if is_cyclic:
632
  three_letter = f"cyclo({three_letter})"
633
  one_letter = f"cyclo({one_letter})"
634
- """
635
- print(f"\nFinal sequence: {three_letter}")
636
- print(f"One-letter code: {one_letter}")
637
- print(f"Is cyclic: {is_cyclic}")
638
- print(f"Peptide cycles: {peptide_cycles}")
639
- print(f"Aromatic cycles: {aromatic_cycles}")
640
- """
641
  return {
642
  'three_letter': three_letter,
643
  'one_letter': one_letter,
644
  'is_cyclic': is_cyclic,
645
- 'residues': sequence
 
646
  }
647
 
648
  def annotate_cyclic_structure(mol, sequence):
@@ -651,12 +571,10 @@ def annotate_cyclic_structure(mol, sequence):
651
 
652
  drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)
653
 
654
- # Draw molecule first
655
  drawer.drawOptions().addAtomIndices = False
656
  drawer.DrawMolecule(mol)
657
  drawer.FinishDrawing()
658
 
659
- # Convert to PIL Image
660
  img = Image.open(BytesIO(drawer.GetDrawingText()))
661
  draw = ImageDraw.Draw(img)
662
  try:
@@ -668,7 +586,6 @@ def annotate_cyclic_structure(mol, sequence):
668
  print("Warning: TrueType fonts not available, using default font")
669
  small_font = ImageFont.load_default()
670
 
671
- # Header
672
  seq_text = f"Sequence: {sequence}"
673
  bbox = draw.textbbox((1000, 100), seq_text, font=small_font)
674
  padding = 10
@@ -751,7 +668,6 @@ def create_enhanced_linear_viz(sequence, smiles):
751
  text += f" ({', '.join(mods)})"
752
  color = 'blue'
753
  else:
754
- # Must be a bond
755
  text = f"Bond {i}: "
756
  if 'O-linked' in segment.get('bond_after', ''):
757
  text += "ester"
@@ -893,7 +809,7 @@ class PeptideStructureGenerator:
893
  def process_input(
894
  smiles_input=None,
895
  file_obj=None,
896
- show_linear=False,
897
  show_segment_details=False,
898
  generate_3d=False,
899
  use_uff=False
@@ -946,60 +862,22 @@ def process_input(
946
  except Exception as e:
947
  return f"Error generating 3D structures: {str(e)}", None, None, []
948
 
949
- analysis = analyzer.analyze_structure(smiles)
950
  three_letter = analysis['three_letter']
951
  one_letter = analysis['one_letter']
952
  is_cyclic = analysis['is_cyclic']
953
-
954
- # Only include segment analysis in output if requested
955
- if show_segment_details:
956
- segments = analyzer.split_on_bonds(smiles)
957
-
958
- sequence_parts = []
959
- output_text = ""
960
- output_text += "Segment Analysis:\n"
961
- for i, segment in enumerate(segments):
962
- output_text += f"\nSegment {i}:\n"
963
- output_text += f"Content: {segment['content']}\n"
964
- output_text += f"Bond before: {segment.get('bond_before', 'None')}\n"
965
- output_text += f"Bond after: {segment.get('bond_after', 'None')}\n"
966
-
967
- residue, mods = analyzer.identify_residue(segment)
968
- if residue:
969
- if mods:
970
- sequence_parts.append(f"{residue}({','.join(mods)})")
971
- else:
972
- sequence_parts.append(residue)
973
- output_text += f"Identified as: {residue}\n"
974
- output_text += f"Modifications: {mods}\n"
975
- else:
976
- output_text += f"Warning: Could not identify residue in segment: {segment['content']}\n"
977
- output_text += "\n"
978
- is_cyclic, peptide_cycles, aromatic_cycles = analyzer.is_cyclic(smiles)
979
- three_letter = '-'.join(sequence_parts)
980
- one_letter = ''.join(analyzer.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence_parts)
981
- else:
982
- pass
983
 
984
  img_cyclic = annotate_cyclic_structure(mol, three_letter)
985
-
986
- # Create linear representation if requested
987
- img_linear = None
988
- if show_linear:
989
- fig_linear = create_enhanced_linear_viz(three_letter, smiles)
990
- buf = BytesIO()
991
- fig_linear.savefig(buf, format='png', bbox_inches='tight', dpi=300)
992
- buf.seek(0)
993
- img_linear = Image.open(buf)
994
- plt.close(fig_linear)
995
 
 
 
 
 
996
  summary = "Summary:\n"
997
  summary += f"Sequence: {three_letter}\n"
998
  summary += f"One-letter code: {one_letter}\n"
999
  summary += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
1000
- #if is_cyclic:
1001
- #summary += f"Peptide Cycles: {', '.join(peptide_cycles)}\n"
1002
- #summary += f"Aromatic Cycles: {', '.join(aromatic_cycles)}\n"
1003
 
1004
  if structure_files:
1005
  summary += "\n3D Structures Generated:\n"
@@ -1007,11 +885,11 @@ def process_input(
1007
  summary += f"- {os.path.basename(filepath)}\n"
1008
 
1009
  #return summary, img_cyclic, img_linear, structure_files if structure_files else None
1010
- return summary, img_cyclic
1011
 
1012
  except Exception as e:
1013
  #return f"Error processing SMILES: {str(e)}", None, None, []
1014
- return f"Error processing SMILES: {str(e)}", None
1015
  # Handle file input
1016
  if file_obj is not None:
1017
  try:
@@ -1032,7 +910,6 @@ def process_input(
1032
  continue
1033
 
1034
  try:
1035
- # Process the structure
1036
  result = analyzer.analyze_structure(smiles)
1037
 
1038
  output_text += f"\nSummary for SMILES: {smiles}\n"
@@ -1053,7 +930,7 @@ def process_input(
1053
  output_text or "No analysis done.",
1054
  img_cyclic if 'img_cyclic' in locals() else None,
1055
  #img_linear if 'img_linear' in locals() else None,
1056
- #structure_files if structure_files else []
1057
  )
1058
 
1059
  iface = gr.Interface(
@@ -1063,11 +940,24 @@ iface = gr.Interface(
1063
  label="Enter SMILES string",
1064
  placeholder="Enter SMILES notation of peptide...",
1065
  lines=2
1066
- ),],
1067
- #gr.File(
1068
- #label="Or upload a text file with SMILES",
1069
- #file_types=[".txt"]
1070
- #)],
 
 
 
 
 
 
 
 
 
 
 
 
 
1071
  outputs=[
1072
  gr.Textbox(
1073
  label="Analysis Results",
@@ -1077,6 +967,10 @@ iface = gr.Interface(
1077
  label="2D Structure with Annotations",
1078
  type="pil"
1079
  ),
 
 
 
 
1080
  ],
1081
  title="Peptide Structure Analyzer and Visualizer",
1082
  description='''
@@ -1105,30 +999,4 @@ iface = gr.Interface(
1105
  )
1106
 
1107
  if __name__ == "__main__":
1108
- iface.launch(share=True)
1109
- """
1110
- from fastapi import FastAPI
1111
- import gradio as gr
1112
-
1113
- # 1) Make a FastAPI with no OpenAPI/docs routes
1114
- app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)
1115
-
1116
- # 2) Build your Interface as before
1117
- iface = gr.Interface(
1118
- fn=process_input,
1119
- inputs=[ gr.Textbox(label="Enter SMILES string", lines=2) ],
1120
- outputs=[
1121
- gr.Textbox(label="Analysis Results", lines=10),
1122
- gr.Image(label="2D Structure with Annotations", type="pil"),
1123
- ],
1124
- title="Peptide Structure Analyzer and Visualizer",
1125
- flagging_mode="never"
1126
- )
1127
-
1128
- # 3) Mount it at “/”
1129
- app = gr.mount_gradio_app(app, iface, path="/")
1130
-
1131
- if __name__ == "__main__":
1132
- import uvicorn
1133
- uvicorn.run(app, host="0.0.0.0", port=7860)
1134
- """
 
1
  import os
2
  import gradio_client.utils as client_utils
3
+ # Monkey patch for gradio_client issue: treat boolean JSON schemas as "Any"
4
  _original = client_utils._json_schema_to_python_type
5
  def _safe_json_schema_to_python_type(schema, defs=None):
6
  if isinstance(schema, bool):
7
  return "Any"
8
  return _original(schema, defs)
 
 
9
  client_utils._json_schema_to_python_type = _safe_json_schema_to_python_type
10
  client_utils.json_schema_to_python_type = _safe_json_schema_to_python_type
11
  import gradio as gr
 
35
  (r'C\(=O\)N[12]?', 'peptide_reverse') # Reverse peptide bond
36
  ]
37
  self.complex_residue_patterns = [
 
38
  (r'\[C[@]H\]\(CCCNC\(=O\)CCC\[C@@H\]\(NC\(=O\)CCCCCCCCCCCCCCCC\)C\(=O\)OC\(C\)\(C\)C\)', 'Kpg'),
39
  (r'CCCCCCCCCCCCCCCCC\(=O\)N\[C@H\]\(CCCC\(=O\)NCCC\[C@@H\]', 'Kpg'),
40
  (r'\[C@*H\]\(CSC\(c\d+ccccc\d+\)\(c\d+ccccc\d+\)c\d+ccc\(OC\)cc\d+\)', 'Cmt'),
41
+ (r'CSC\(c.*?c.*?OC\)', 'Cmt'),
42
+ (r'COc.*?ccc\(C\(SC', 'Cmt'),
43
+ (r'c2ccccc2\)c2ccccc2\)cc', 'Cmt'),
44
+ # Glu(OAll)
45
  (r'C=CCOC\(=O\)CC\[C@@H\]', 'Eal'),
46
  (r'\(C\)OP\(=O\)\(O\)OCc\d+ccccc\d+', 'Tpb'),
47
  #(r'COc\d+ccc\(C\(SC\[C@@H\]\d+.*?\)\(c\d+ccccc\d+\)c\d+ccccc\d+\)cc\d+', 'Cmt-cyclic'),
48
 
49
+ # Dtg - Asp(OtBu)-(Dmb)Gly
50
  (r'CN\(Cc\d+ccc\(OC\)cc\d+OC\)C\(=O\)\[C@H\]\(CC\(=O\)OC\(C\)\(C\)C\)', 'Dtg'),
51
  (r'C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
52
  (r'N\[C@@H\]\(CC\(=O\)OC\(C\)\(C\)C\)C\(=O\)N\(CC\d+=C\(C=C\(C=C\d+\)OC\)OC\)CC\(=O\)', 'Dtg'),
 
68
  }
69
  def preprocess_complex_residues(self, smiles):
70
  """Identify and protect complex residues with internal peptide bonds - improved to prevent overlaps"""
 
71
  complex_positions = []
72
 
 
73
  for pattern, residue_type in self.complex_residue_patterns:
74
  for match in re.finditer(pattern, smiles):
 
75
  if not any(pos['start'] <= match.start() < pos['end'] or
76
  pos['start'] < match.end() <= pos['end'] for pos in complex_positions):
77
  complex_positions.append({
 
81
  'pattern': match.group()
82
  })
83
 
 
84
  complex_positions.sort(key=lambda x: x['start'])
85
 
 
86
  if not complex_positions:
87
  return smiles, []
88
 
 
89
  preprocessed_smiles = smiles
90
+ offset = 0
91
 
92
  protected_residues = []
93
 
94
  for pos in complex_positions:
 
95
  start = pos['start'] + offset
96
  end = pos['end'] + offset
97
 
 
98
  complex_part = preprocessed_smiles[start:end]
99
 
 
100
  if not ('[C@H]' in complex_part or '[C@@H]' in complex_part):
101
+ continue
102
 
 
103
  placeholder = f"COMPLEX_RESIDUE_{len(protected_residues)}"
104
 
 
105
  preprocessed_smiles = preprocessed_smiles[:start] + placeholder + preprocessed_smiles[end:]
106
 
 
107
  offset += len(placeholder) - (end - start)
108
 
 
109
  protected_residues.append({
110
  'placeholder': placeholder,
111
  'type': pos['type'],
112
  'content': complex_part
113
  })
114
+
 
 
115
  return preprocessed_smiles, protected_residues
116
  def split_on_bonds(self, smiles, protected_residues=None):
117
  """Split SMILES into segments based on peptide bonds, with improved handling of protected residues"""
118
  positions = []
119
  used = set()
120
 
121
+ # Handle protected complex residues if any
122
  if protected_residues:
123
  for residue in protected_residues:
124
  match = re.search(residue['placeholder'], smiles)
 
148
  })
149
  used.update(range(match.start(), match.end()))
150
 
 
151
  for pattern, bond_type in self.bond_patterns:
152
  for match in re.finditer(pattern, smiles):
153
  if not any(p in range(match.start(), match.end()) for p in used):
 
159
  })
160
  used.update(range(match.start(), match.end()))
161
 
 
162
  bond_positions.sort(key=lambda x: x['start'])
163
 
 
164
  all_positions = positions + bond_positions
165
  all_positions.sort(key=lambda x: x['start'])
166
 
 
167
  segments = []
168
 
 
169
  if all_positions and all_positions[0]['start'] > 0:
170
  segments.append({
171
  'content': smiles[0:all_positions[0]['start']],
 
173
  'complex_after': all_positions[0]['pattern'] if all_positions[0]['type'] == 'complex' else None
174
  })
175
 
 
176
  for i in range(len(all_positions)-1):
177
  current = all_positions[i]
178
  next_pos = all_positions[i+1]
179
 
 
180
  if current['type'] == 'complex':
181
  segments.append({
182
  'content': current['content'],
 
184
  'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None,
185
  'complex_type': current['residue_type']
186
  })
 
187
  elif current['type'] == 'gly':
188
  segments.append({
189
  'content': 'NCC(=O)',
 
191
  'bond_after': next_pos['pattern'] if next_pos['type'] != 'complex' else None
192
  })
193
  else:
 
194
  content = smiles[current['end']:next_pos['start']]
195
  if content and next_pos['type'] != 'complex':
196
  segments.append({
 
241
  # Find all numbers used in ring closures
242
  ring_numbers = re.findall(r'(?:^|[^c])[0-9](?=[A-Z@\(\)])', smiles)
243
 
244
+ # Aromatic ring numbers
245
  aromatic_matches = re.findall(r'c[0-9](?:ccccc|c\[nH\]c)[0-9]', smiles)
246
  aromatic_cycles = []
247
  for match in aromatic_matches:
248
  numbers = re.findall(r'[0-9]', match)
249
  aromatic_cycles.extend(numbers)
250
 
 
251
  peptide_cycles = [n for n in ring_numbers if n not in aromatic_cycles]
252
 
253
  is_cyclic = len(peptide_cycles) > 0 and not smiles.endswith('C(=O)O')
 
281
  print("DIRECT MATCH: Found Cmt at beginning")
282
  return 'Cmt', mods
283
 
 
284
  if '[C@@H]3CCCN3C2=O)(c2ccccc2)c2ccccc2)cc' in content:
285
  print("DIRECT MATCH: Found Pro at end")
286
  return 'Pro', mods
287
+
288
  # Eal - Glu(OAll) - Multiple patterns
289
  if 'CCC(=O)OCC=C' in content or 'CC(=O)OCC=C' in content or 'C=CCOC(=O)CC' in content:
290
  return 'Eal', mods
291
+ # Proline (P)
292
  if any([
 
293
  (segment.get('bond_after', '').startswith(f'N{n}C(=O)') and 'CCC' in content and
294
  any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
295
  for n in '123456789'
 
297
  any(f'CCC{n}' for n in '123456789'))
298
  for n in '123456789'
299
  ]) or any([
 
300
  (f'CCCN{n}' in content and content.endswith('=O') and
301
  any(f'[C@@H]{n}' in content or f'[C@H]{n}' in content for n in '123456789'))
302
  for n in '123456789'
303
  ]) or any([
304
+ # CCC[C@H]n
305
  (content == f'CCC[C@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
306
  (content == f'CCC[C@@H]{n}' and segment.get('bond_before', '').startswith(f'C(=O)N{n}')) or
307
  # N-terminal Pro with any ring number
 
318
  # Tryptophan (W) - more specific indole pattern
319
  if re.search(r'c[0-9]c\[nH\]c[0-9]ccccc[0-9][0-9]', content) and \
320
  'c[nH]c' in content.replace(' ', ''):
 
321
  if '[C@H](CC' in content: # D-form
322
  return 'trp', mods
323
  return 'Trp', mods
324
 
325
  # Lysine (K) - both patterns
326
  if '[C@@H](CCCCN)' in content or '[C@H](CCCCN)' in content:
 
327
  if '[C@H](CCCCN)' in content: # D-form
328
  return 'lys', mods
329
  return 'Lys', mods
330
 
331
  # Arginine (R) - both patterns
332
  if '[C@@H](CCCNC(=N)N)' in content or '[C@H](CCCNC(=N)N)' in content:
 
333
  if '[C@H](CCCNC(=N)N)' in content: # D-form
334
  return 'arg', mods
335
  return 'Arg', mods
336
 
 
337
  if content == 'C' and segment.get('bond_before') and segment.get('bond_after'):
 
338
  if ('C(=O)N' in segment['bond_before'] or 'NC(=O)' in segment['bond_before'] or 'N(C)C(=O)' in segment['bond_before']) and \
339
  ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
340
  return 'Gly', mods
341
 
 
342
  if 'CNC' in content and any(f'C{i}=' in content for i in range(1, 10)):
343
+ return 'Gly', mods #'CNC1=O'
344
  if not segment.get('bond_before') and segment.get('bond_after'):
345
  if content == 'C' or content == 'NC':
346
  if ('NC(=O)' in segment['bond_after'] or 'C(=O)N' in segment['bond_after'] or 'N(C)C(=O)' in segment['bond_after']):
 
348
 
349
  # Leucine patterns (L/l)
350
  if 'CC(C)C[C@H]' in content or 'CC(C)C[C@@H]' in content or '[C@@H](CC(C)C)' in content or '[C@H](CC(C)C)' in content or (('N[C@H](CCC(C)C)' in content or 'N[C@@H](CCC(C)C)' in content) and segment.get('bond_before') is None):
 
351
  if '[C@H](CC(C)C)' in content or 'CC(C)C[C@H]' in content: # D-form
352
  return 'leu', mods
353
  return 'Leu', mods
354
 
355
  # Threonine patterns (T/t)
356
  if '[C@@H]([C@@H](C)O)' in content or '[C@H]([C@H](C)O)' in content or '[C@@H]([C@H](C)O)' in content or '[C@H]([C@@H](C)O)' in content:
 
357
  if '[C@H]([C@@H](C)O)' in content: # D-form
358
  return 'thr', mods
359
  return 'Thr', mods
 
363
 
364
  # Phenylalanine patterns (F/f)
365
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content) or re.search(r'\[C@@H\]\(Cc\d+ccccc\d+\)', content):
 
366
  if re.search(r'\[C@H\]\(Cc\d+ccccc\d+\)', content): # D-form
367
  return 'phe', mods
368
  return 'Phe', mods
 
371
  '[C@H](C(C)C)' in content or '[C@@H](C(C)C)' in content or
372
  'C(C)C[C@H]' in content or 'C(C)C[C@@H]' in content):
373
 
 
374
  if not any(p in content for p in ['CC(C)C[C@H]', 'CC(C)C[C@@H]', 'CCC(=O)']):
 
375
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
376
  return 'val', mods
377
  return 'Val', mods
378
 
379
  # Isoleucine patterns (I/i)
 
380
  if (any(['CC[C@@H](C)' in content, '[C@@H](C)CC' in content, '[C@@H](CC)C' in content,
381
  'C(C)C[C@@H]' in content, '[C@@H]([C@H](C)CC)' in content, '[C@H]([C@@H](C)CC)' in content,
382
  '[C@@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
 
386
  'CC[C@H](C)[C@H]' in content, 'CC[C@@H](C)[C@@H]' in content])
387
  and 'CC(C)C' not in content): # Exclude valine pattern
388
 
 
389
  if any(['[C@H]([C@@H](CC)C)' in content, '[C@H](CC)C' in content,
390
  '[C@H]([C@@H](C)CC)' in content, '[C@H]([C@H](C)CC)' in content,
391
  'C[C@@H](CC)[C@H]' in content, 'C[C@H](CC)[C@H]' in content,
392
  'CC[C@@H](C)[C@H]' in content, 'CC[C@H](C)[C@H]' in content]):
393
  # D-form
394
  return 'ile', mods
 
395
  return 'Ile', mods
396
+ # Tpb - Thr(PO(OBzl)OH)
397
  if re.search(r'\(C\)OP\(=O\)\(O\)OCc[0-9]ccccc[0-9]', content) or 'OP(=O)(O)OCC' in content:
398
  return 'Tpb', mods
399
 
400
  # Alanine patterns (A/a)
401
  if ('[C@H](C)' in content or '[C@@H](C)' in content):
402
  if not any(p in content for p in ['C(C)C', 'COC', 'CN(', 'C(C)O', 'CC[C@H]', 'CC[C@@H]']):
 
403
  if '[C@H](C)' in content: # D-form
404
  return 'ala', mods
405
  return 'Ala', mods
406
 
407
  # Tyrosine patterns (Y/y)
408
  if re.search(r'Cc[0-9]ccc\(O\)cc[0-9]', content):
 
409
  if '[C@H](Cc1ccc(O)cc1)' in content: # D-form
410
  return 'tyr', mods
411
  return 'Tyr', mods
 
413
  # Serine patterns (S/s)
414
  if '[C@H](CO)' in content or '[C@@H](CO)' in content:
415
  if not ('C(C)O' in content or 'COC' in content):
 
416
  if '[C@H](CO)' in content: # D-form
417
  return 'ser', mods
418
  return 'Ser', mods
419
 
420
  if 'CSSC' in content:
421
+ # cysteine-cysteine bridge
422
  if re.search(r'\[C@@H\].*CSSC.*\[C@@H\]', content) or re.search(r'\[C@H\].*CSSC.*\[C@H\]', content):
423
  if '[C@H]' in content and not '[C@@H]' in content: # D-form
424
  return 'cys-cys', mods
425
  return 'Cys-Cys', mods
426
 
427
+ # N-terminal amine group
428
  if '[C@@H](N)CSSC' in content or '[C@H](N)CSSC' in content:
429
  if '[C@H](N)CSSC' in content: # D-form
430
  return 'cys-cys', mods
431
  return 'Cys-Cys', mods
432
 
433
+ # C-terminal carboxyl
434
  if 'CSSC[C@@H](C(=O)O)' in content or 'CSSC[C@H](C(=O)O)' in content:
435
  if 'CSSC[C@H](C(=O)O)' in content: # D-form
436
  return 'cys-cys', mods
 
438
 
439
  # Cysteine patterns (C/c)
440
  if '[C@H](CS)' in content or '[C@@H](CS)' in content:
 
441
  if '[C@H](CS)' in content: # D-form
442
  return 'cys', mods
443
  return 'Cys', mods
444
 
445
  # Methionine patterns (M/m)
446
  if ('CCSC' in content) or ("CSCC" in content):
 
447
  if '[C@H](CCSC)' in content: # D-form
448
  return 'met', mods
449
  elif '[C@H]' in content:
 
452
 
453
  # Glutamine patterns (Q/q)
454
  if (content == '[C@@H](CC' or content == '[C@H](CC' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CCC(=O)N' in content) or ('CCC(N)=O' in content):
 
455
  if '[C@H](CCC(=O)N)' in content: # D-form
456
  return 'gln', mods
457
  return 'Gln', mods
458
 
459
  # Asparagine patterns (N/n)
460
  if (content == '[C@@H](C' or content == '[C@H](C' and segment.get('bond_before')=='C(=O)N' and segment.get('bond_after')=='C(=O)N') or ('CC(=O)N' in content) or ('CCN(=O)' in content) or ('CC(N)=O' in content):
 
461
  if '[C@H](CC(=O)N)' in content: # D-form
462
  return 'asn', mods
463
  return 'Asn', mods
464
 
465
  # Glutamic acid patterns (E/e)
466
  if ('CCC(=O)O' in content):
 
467
  if '[C@H](CCC(=O)O)' in content: # D-form
468
  return 'glu', mods
469
  return 'Glu', mods
470
 
471
  # Aspartic acid patterns (D/d)
472
  if ('CC(=O)O' in content):
 
473
  if '[C@H](CC(=O)O)' in content: # D-form
474
  return 'asp', mods
475
  return 'Asp', mods
476
 
477
  if re.search(r'Cc\d+c\[nH\]cn\d+', content) or re.search(r'Cc\d+cnc\[nH\]\d+', content):
 
478
  if '[C@H]' in content: # D-form
479
  return 'his', mods
480
  return 'His', mods
 
484
  if ('N[C@@H](CCCC)' in content or '[C@@H](CCCC)' in content or 'CCCC[C@@H]' in content or
485
  'N[C@H](CCCC)' in content or '[C@H](CCCC)' in content) and 'CC(C)' not in content:
486
  return 'Nle', mods
487
+
 
488
  if 'C(C)(C)(N)' in content:
489
  return 'Aib', mods
490
 
 
491
  if 'C(C)(C)' in content and 'OC(C)(C)C' not in content:
492
  if (segment.get('bond_before') and segment.get('bond_after') and
493
  any(bond in segment['bond_before'] for bond in ['C(=O)N', 'NC(=O)', 'N(C)C(=O)']) and
494
  any(bond in segment['bond_after'] for bond in ['NC(=O)', 'C(=O)N', 'N(C)C(=O)'])):
495
  return 'Aib', mods
496
 
497
+ # Dtg - Asp(OtBu)-(Dmb)Gly
498
  if 'CC(=O)OC(C)(C)C' in content and 'CC1=C(C=C(C=C1)OC)OC' in content:
499
  return 'Dtg', mods
500
 
501
 
502
+ # Kpg - Lys(palmitoyl-Glu-OtBu)
503
  if 'CCCNC(=O)' in content and 'CCCCCCCCCCCC' in content:
504
  return 'Kpg', mods
505
 
506
 
 
507
  return None, mods
508
 
509
  def get_modifications(self, segment):
 
524
 
525
  return mods
526
 
527
def analyze_structure(self, smiles, verbose=False):
    """Translate a peptide SMILES string into residue sequences.

    The SMILES is preprocessed for protected/complex residues, split into
    segments on its bonds, and each segment is passed to
    ``identify_residue``.

    Args:
        smiles: SMILES string of the peptide.
        verbose: When true, a per-segment dump (content, bond before,
            bond after) is added to the returned ``details`` text.

    Returns:
        A dict with keys:
            ``three_letter`` - residues joined by ``-`` (wrapped in
                ``cyclo(...)`` when the peptide is cyclic),
            ``one_letter`` - one-letter codes, ``X`` for residues missing
                from ``self.three_to_one`` (same ``cyclo(...)`` wrapping),
            ``is_cyclic`` - cyclicity flag reported by ``self.is_cyclic``,
            ``residues`` - list of identified residues, each optionally
                suffixed with ``(mod1,mod2,...)``,
            ``details`` - newline-joined log text.
    """
    logs = []

    preprocessed_smiles, protected_residues = self.preprocess_complex_residues(smiles)

    # Cyclicity is evaluated on the raw input, not the preprocessed string.
    # Only the flag is needed here; the ring-number lists are discarded.
    is_cyclic, _, _ = self.is_cyclic(smiles)

    segments = self.split_on_bonds(preprocessed_smiles, protected_residues)

    sequence = []
    for i, segment in enumerate(segments):
        if verbose:
            logs.append(f"\nSegment {i}:")
            logs.append(f" Content: {segment.get('content','None')}")
            logs.append(f" Bond before: {segment.get('bond_before','None')}")
            logs.append(f" Bond after: {segment.get('bond_after','None')}")

        residue, mods = self.identify_residue(segment)
        if not residue:
            # Unidentified segments are reported even when verbose is off;
            # they contribute nothing to the sequence itself.
            logs.append(f"Warning: Could not identify residue in segment: {segment.get('content', 'None')}")
            continue

        sequence.append(f"{residue}({','.join(mods)})" if mods else residue)

    three_letter = '-'.join(sequence)
    # Strip any "(mods)" suffix before looking up the one-letter code.
    one_letter = ''.join(self.three_to_one.get(aa.split('(')[0], 'X') for aa in sequence)

    if is_cyclic:
        three_letter = f"cyclo({three_letter})"
        one_letter = f"cyclo({one_letter})"

    return {
        'three_letter': three_letter,
        'one_letter': one_letter,
        'is_cyclic': is_cyclic,
        'residues': sequence,
        'details': "\n".join(logs)
    }
567
 
568
  def annotate_cyclic_structure(mol, sequence):
 
571
 
572
  drawer = Draw.rdMolDraw2D.MolDraw2DCairo(2000, 2000)
573
 
 
574
  drawer.drawOptions().addAtomIndices = False
575
  drawer.DrawMolecule(mol)
576
  drawer.FinishDrawing()
577
 
 
578
  img = Image.open(BytesIO(drawer.GetDrawingText()))
579
  draw = ImageDraw.Draw(img)
580
  try:
 
586
  print("Warning: TrueType fonts not available, using default font")
587
  small_font = ImageFont.load_default()
588
 
 
589
  seq_text = f"Sequence: {sequence}"
590
  bbox = draw.textbbox((1000, 100), seq_text, font=small_font)
591
  padding = 10
 
668
  text += f" ({', '.join(mods)})"
669
  color = 'blue'
670
  else:
 
671
  text = f"Bond {i}: "
672
  if 'O-linked' in segment.get('bond_after', ''):
673
  text += "ester"
 
809
  def process_input(
810
  smiles_input=None,
811
  file_obj=None,
812
+ #show_linear=False,
813
  show_segment_details=False,
814
  generate_3d=False,
815
  use_uff=False
 
862
  except Exception as e:
863
  return f"Error generating 3D structures: {str(e)}", None, None, []
864
 
865
+ analysis = analyzer.analyze_structure(smiles, verbose=show_segment_details)
866
  three_letter = analysis['three_letter']
867
  one_letter = analysis['one_letter']
868
  is_cyclic = analysis['is_cyclic']
869
+ details = analysis.get('details', "")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
870
 
871
  img_cyclic = annotate_cyclic_structure(mol, three_letter)
 
 
 
 
 
 
 
 
 
 
872
 
873
+ summary = ""
874
+ if show_segment_details and details:
875
+ summary += "Segment Analysis:\n"
876
+ summary += details + "\n\n"
877
  summary = "Summary:\n"
878
  summary += f"Sequence: {three_letter}\n"
879
  summary += f"One-letter code: {one_letter}\n"
880
  summary += f"Is Cyclic: {'Yes' if is_cyclic else 'No'}\n"
 
 
 
881
 
882
  if structure_files:
883
  summary += "\n3D Structures Generated:\n"
 
885
  summary += f"- {os.path.basename(filepath)}\n"
886
 
887
  #return summary, img_cyclic, img_linear, structure_files if structure_files else None
888
+ return summary, img_cyclic, structure_files or None
889
 
890
  except Exception as e:
891
  #return f"Error processing SMILES: {str(e)}", None, None, []
892
+ return f"Error processing SMILES: {str(e)}", None, []
893
  # Handle file input
894
  if file_obj is not None:
895
  try:
 
910
  continue
911
 
912
  try:
 
913
  result = analyzer.analyze_structure(smiles)
914
 
915
  output_text += f"\nSummary for SMILES: {smiles}\n"
 
930
  output_text or "No analysis done.",
931
  img_cyclic if 'img_cyclic' in locals() else None,
932
  #img_linear if 'img_linear' in locals() else None,
933
+ structure_files if structure_files else []
934
  )
935
 
936
  iface = gr.Interface(
 
940
  label="Enter SMILES string",
941
  placeholder="Enter SMILES notation of peptide...",
942
  lines=2
943
+ ),
944
+ gr.File(
945
+ label="Or upload a text file with SMILES",
946
+ file_types=[".txt"]
947
+ ),
948
+ gr.Checkbox(
949
+ label="Show show segmentation details",
950
+ value=False
951
+ ),
952
+ gr.Checkbox(
953
+ label="Generate 3D structure (sdf file format)",
954
+ value=False
955
+ ),
956
+ gr.Checkbox(
957
+ label="Use UFF optimization (may take long)",
958
+ value=False
959
+ )
960
+ ],
961
  outputs=[
962
  gr.Textbox(
963
  label="Analysis Results",
 
967
  label="2D Structure with Annotations",
968
  type="pil"
969
  ),
970
+ gr.File(
971
+ label="3D Structure Files",
972
+ file_count="multiple"
973
+ )
974
  ],
975
  title="Peptide Structure Analyzer and Visualizer",
976
  description='''
 
999
  )
1000
 
1001
# Script entry point: start the Gradio interface only when this file is run
# directly, not when it is imported.
if __name__ == "__main__":
    # share=True asks Gradio to also create a public share link
    # (see the Gradio launch() documentation).
    iface.launch(share=True)