-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmulti_molecule_text_parser.py
More file actions
89 lines (72 loc) · 2.67 KB
/
multi_molecule_text_parser.py
File metadata and controls
89 lines (72 loc) · 2.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from __future__ import annotations
from pydantic import BaseModel, ConfigDict
from typing import Iterator
class XyzFrame(BaseModel):
    """A single frame of an XYZ file, kept as its raw text lines.

    The model is configured as frozen (see ``model_config``).
    """

    model_config = ConfigDict(frozen=True)

    # Atom count taken from the frame's first line.
    natoms: int
    # The frame's second line, stored verbatim.
    comment: str
    # The raw atom records, one string per atom.
    atom_lines: list[str]

    @property
    def energy(self) -> float | None:
        """Return the comment line parsed as a float, or ``None`` if that fails."""
        # CREST typically writes the energy as the entire comment line,
        # possibly padded with spaces, so the whole stripped line is parsed.
        try:
            parsed = float(self.comment.strip())
        except Exception:
            parsed = None
        return parsed

    def to_xyz_string(self) -> str:
        """Serialize the frame back to XYZ text, ending with a newline."""
        rows = [str(self.natoms), self.comment]
        rows.extend(self.atom_lines)
        return "\n".join(rows) + "\n"
def iter_multixyz_frames(text: str) -> Iterator[XyzFrame]:
    """
    Parse a concatenated (multi-)XYZ file into individual XYZ frames.

    Each frame consists of an atom-count line, a comment line, and then that
    many atom lines. Blank lines before the first frame, between frames, and
    after the last frame are skipped.

    Args:
        text: Full contents of the (multi-)XYZ file.

    Yields:
        One ``XyzFrame`` per frame, in file order.

    Raises:
        ValueError: If an atom count is not a positive integer, if a frame is
            truncated, or if an atom line has fewer than 4
            whitespace-separated tokens. Because this is a generator, the
            error is raised during iteration, after any earlier valid frames
            have already been yielded.
    """
    # splitlines() already removes the line terminators.
    lines = text.splitlines()
    n = len(lines)

    def skip_blanks(idx: int) -> int:
        """Return the first index >= idx whose line is not blank (or n)."""
        while idx < n and not lines[idx].strip():
            idx += 1
        return idx

    i = skip_blanks(0)
    while i < n:
        # Line 1 of the frame: atom count. int() on a str can only fail with
        # ValueError, so nothing broader is caught.
        try:
            natoms = int(lines[i].strip())
        except ValueError as e:
            raise ValueError(
                f"Expected atom count at line {i+1}, got: {lines[i]!r}"
            ) from e
        if natoms <= 0:
            raise ValueError(f"Atom count must be > 0 at line {i+1}, got: {natoms}")

        # Line 2 of the frame: comment. It is taken positionally, so it may
        # legitimately be blank.
        if i + 1 >= n:
            raise ValueError(f"Truncated XYZ: missing comment line after line {i+1}")
        comment = lines[i + 1]

        # The next `natoms` lines are the atom records.
        start = i + 2
        end = start + natoms
        if end > n:
            raise ValueError(
                f"Truncated XYZ frame starting at line {i+1}: "
                f"expected {natoms} atom lines, got {max(0, n - start)}"
            )
        atom_lines = lines[start:end]

        # Sanity check: each atom line needs at least 4 tokens
        # (presumably element symbol plus three coordinates).
        for k, ln in enumerate(atom_lines, start=1):
            if len(ln.split()) < 4:
                raise ValueError(
                    f"Invalid atom line #{k} in frame starting at line {i+1}: {ln!r}"
                )

        yield XyzFrame(natoms=natoms, comment=comment, atom_lines=atom_lines)

        # Blank separator lines are only tolerated between frames.
        i = skip_blanks(end)
def iter_sdf_frames(text: str) -> Iterator[str]:
    """
    Parse a multi-molecule SDF file into individual molblocks.

    Records are delimited by lines consisting of '$$$$' (trailing whitespace
    on the delimiter line is tolerated). Unix, Windows and bare-CR line
    endings are all recognised, and the line endings inside each record are
    left as they were.

    The beginning of each record is preserved exactly. The first line of a
    molblock is its title line, which is frequently empty, and the molfile
    header is positional; stripping leading blank lines or spaces would shift
    the counts line and corrupt the block.

    Args:
        text: Full contents of the SDF file.

    Yields:
        Each non-blank record as a string, without the '$$$$' delimiter and
        with trailing whitespace removed. Records that contain only
        whitespace are skipped.
    """
    pending: list[str] = []

    def flush() -> str | None:
        """Join and clear the buffered lines; return None for a blank record."""
        record = "".join(pending)
        pending.clear()
        # Only the tail is trimmed; the head carries the positional header.
        return record.rstrip() if record.strip() else None

    for line in text.splitlines(keepends=True):
        # A delimiter must start at column 0; '$$$$' appearing mid-line is
        # ordinary record content.
        if line.rstrip() == "$$$$":
            record = flush()
            if record is not None:
                yield record
        else:
            pending.append(line)

    # Content after the last delimiter (or a file with no delimiter at all).
    record = flush()
    if record is not None:
        yield record