-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathdata.py
More file actions
211 lines (183 loc) · 8.38 KB
/
data.py
File metadata and controls
211 lines (183 loc) · 8.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import joblib, json
import numpy as np
import pandas as pd
from functools import partial
from tqdm import tqdm
# Prevent tqdm from printing multiline progress bars
tqdm = partial(tqdm, position=0, leave=True)
from sklearn.preprocessing import OneHotEncoder
from pymatgen import Structure
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
def data_query(mp_api_key, max_elms=3, min_elms=3, max_sites=20, include_te=False):
"""
The function queries data from Materials Project.
Parameters
----------
mp_api_key : str
The API key for Mateirals Project.
max_elms : int, optional
Maximum number of components/elements for crystals to be queried.
The default is 3.
min_elms : int, optional
Minimum number of components/elements for crystals to be queried.
The default is 3.
max_sites : int, optional
Maximum number of components/elements for crystals to be queried.
The default is 20.
include_te : bool, optional
DESCRIPTION. The default is False.
Returns
-------
dataframe : pandas dataframe
Dataframe returned by MPDataRetrieval.
"""
mpdr = MPDataRetrieval(mp_api_key)
# Specify query criteria in MongoDB style
query_criteria = {
'e_above_hull':{'$lte': 0.08}, # eV/atom
'nelements': {'$gte': min_elms, '$lte': max_elms},
'nsites':{'$lte': max_sites},
}
# Specify properties to be queried, properties avaible are at https://github.com/materialsproject/mapidoc/tree/master/materials
query_properties = [
'material_id',
'formation_energy_per_atom',
'band_gap',
'pretty_formula',
'e_above_hull',
'elements',
'cif',
'spacegroup.number'
]
# Obtain queried dataframe containing CIFs and groud-state property labels
dataframe = mpdr.get_dataframe(
criteria = query_criteria,
properties = query_properties,
)
dataframe['ind'] = np.arange(len(dataframe))
if include_te:
dataframe['ind'] = np.arange(0, len(dataframe))
# Read thermoelectric properties from https://datadryad.org/stash/dataset/doi:10.5061/dryad.gn001
te = pd.read_csv('data/thermoelectric_prop.csv', index_col=0)
te = te.dropna()
# Get compound index that has both ground-state and thermoelectric properties
ind = dataframe.index.intersection(te.index)
# Concatenate thermoelectric properties to corresponding compounds
dataframe = pd.concat([dataframe, te.loc[ind,:]], axis=1)
dataframe['Seebeck'] = dataframe['Seebeck'].apply(np.abs)
return dataframe
def FTCP_represent(dataframe, max_elms=3, max_sites=20, return_Nsites=False):
'''
This function represents crystals in the dataframe to their FTCP representations.
Parameters
----------
dataframe : pandas dataframe
Dataframe containing cyrstals to be converted;
CIFs need to be included under column 'cif'.
max_elms : int, optional
Maximum number of components/elements for crystals in the dataframe.
The default is 3.
max_sites : int, optional
Maximum number of sites for crystals in the dataframe.
The default is 20.
return_Nsites : bool, optional
Whether to return number of sites to be used in the error calculation
of reconstructed site coordinate matrix
Returns
-------
FTCP : numpy ndarray
FTCP representation as numpy array for crystals in the dataframe.
'''
# Suppress warnings
import warnings
warnings.filterwarnings("ignore")
# Read string of elements considered in the study
elm_str = joblib.load('data/element.pkl')
# Build one-hot vectors for the elements
elm_onehot = np.arange(1, len(elm_str)+1)[:,np.newaxis]
elm_onehot = OneHotEncoder().fit_transform(elm_onehot).toarray()
# Read elemental properties from atom_init.json from CGCNN (https://github.com/txie-93/cgcnn)
with open('data/atom_init.json') as f:
elm_prop = json.load(f)
elm_prop = {int(key): value for key, value in elm_prop.items()}
# Initialize FTCP array
FTCP = []
if return_Nsites:
Nsites = []
# Represent dataframe
op = tqdm(dataframe.index)
for idx in op:
op.set_description('representing data as FTCP ...')
crystal = Structure.from_str(dataframe['cif'][idx],fmt="cif")
# Obtain element matrix
elm, elm_idx = np.unique(crystal.atomic_numbers, return_index=True)
# Sort elm to the order of sites in the CIF
site_elm = np.array(crystal.atomic_numbers)
elm = site_elm[np.sort(elm_idx)]
# Zero pad element matrix to have at least 3 columns
ELM = np.zeros((len(elm_onehot), max(max_elms, 3),))
ELM[:, :len(elm)] = elm_onehot[elm-1,:].T
# Obtain lattice matrix
latt = crystal.lattice
LATT = np.array((latt.abc, latt.angles))
LATT = np.pad(LATT, ((0, 0), (0, max(max_elms, 3)-LATT.shape[1])), constant_values=0)
# Obtain site coordinate matrix
SITE_COOR = np.array([site.frac_coords for site in crystal])
# Pad site coordinate matrix up to max_sites rows and max_elms columns
SITE_COOR = np.pad(SITE_COOR, ((0, max_sites-SITE_COOR.shape[0]),
(0, max(max_elms, 3)-SITE_COOR.shape[1])), constant_values=0)
# Obtain site occupancy matrix
elm_inverse = np.zeros(len(crystal), dtype=int) # Get the indices of elm that can be used to reconstruct site_elm
for count, e in enumerate(elm):
elm_inverse[np.argwhere(site_elm == e)] = count
SITE_OCCU = OneHotEncoder().fit_transform(elm_inverse[:,np.newaxis]).toarray()
# Zero pad site occupancy matrix to have at least 3 columns, and max_elms rows
SITE_OCCU = np.pad(SITE_OCCU, ((0, max_sites-SITE_OCCU.shape[0]),
(0, max(max_elms, 3)-SITE_OCCU.shape[1])), constant_values=0)
# Obtain elemental property matrix
ELM_PROP = np.zeros((len(elm_prop[1]), max(max_elms, 3),))
ELM_PROP[:, :len(elm)] = np.array([elm_prop[e] for e in elm]).T
# Obtain real-space features; note the zero padding is to cater for the distance of k point in the reciprocal space
REAL = np.concatenate((ELM, LATT, SITE_COOR, SITE_OCCU, np.zeros((1, max(max_elms, 3))), ELM_PROP), axis=0)
# Obtain FTCP matrix
recip_latt = latt.reciprocal_lattice_crystallographic
# First use a smaller radius, if not enough k points, then proceed with a larger radius
hkl, g_hkl, ind, _ = recip_latt.get_points_in_sphere([[0, 0, 0]], [0, 0, 0], 1.297, zip_results=False)
if len(hkl) < 60:
hkl, g_hkl, ind, _ = recip_latt.get_points_in_sphere([[0, 0, 0]], [0, 0, 0], 1.4, zip_results=False)
# Drop (000)
not_zero = g_hkl!=0
hkl = hkl[not_zero,:]
g_hkl = g_hkl[not_zero]
# Convert miller indices to be type int
hkl = hkl.astype('int16')
# Sort hkl
hkl_sum = np.sum(np.abs(hkl),axis=1)
h = -hkl[:,0]
k = -hkl[:,1]
l = -hkl[:,2]
hkl_idx = np.lexsort((l,k,h,hkl_sum))
# Take the closest 59 k points (to origin)
hkl_idx = hkl_idx[:59]
hkl = hkl[hkl_idx,:]
g_hkl = g_hkl[hkl_idx]
# Vectorized computation of (k dot r) for all hkls and fractional coordinates
k_dot_r = np.einsum('ij,kj->ik', hkl, SITE_COOR[:, :3]) # num_hkl x num_sites
# Obtain FTCP matrix
F_hkl = np.matmul(np.pad(ELM_PROP[:,elm_inverse], ((0, 0),
(0, max_sites-len(elm_inverse))), constant_values=0),
np.pi*k_dot_r.T)
# Obtain reciprocal-space features
RECIP = np.zeros((REAL.shape[0], 59,))
# Prepend distances of k points to the FTCP matrix in the reciprocal-space features
RECIP[-ELM_PROP.shape[0]-1, :] = g_hkl
RECIP[-ELM_PROP.shape[0]:, :] = F_hkl
# Obtain FTCP representation, and add to FTCP array
FTCP.append(np.concatenate([REAL, RECIP], axis=1))
if return_Nsites:
Nsites.append(len(crystal))
FTCP = np.stack(FTCP)
if not return_Nsites:
return FTCP
else:
return FTCP, np.array(Nsites)