DNDSR 0.2.1
Distributed Numeric Data Structure for CFV
Loading...
Searching...
No Matches
test_restart_redistribute.py
Go to the documentation of this file.
1"""
2Functional test for ArrayPair redistribution via Euler solver restart.
3
4The test verifies that DOF data written in a restart file by one MPI partition
5can be correctly read by a different partition (different np), producing
6identical solver behaviour.
7
8Steps:
9 1. Run the Euler IV solver for 20 time steps with np=2, writing an H5 restart.
10 2. Load that restart and immediately write it back at np=2 (reference).
11 3. Load that restart and immediately write it back at np=3 (redistributed read).
12 4. Compare the two restart H5 files: the DOF data must match to machine precision.
13
14The file contains three test variants: same-np reseed, cross-np redistribution,
15and a large-mesh multi-np (np=4 → np=4..8) case.
16
17Usage:
18 pytest test/Euler/test_restart_redistribute.py -s
19 # or directly:
20 python test/Euler/test_restart_redistribute.py
21"""
22
23import json
24import os
25import re
26import shutil
27import subprocess
28import sys
29import tempfile
30
31import h5py
32import numpy as np
33import pytest
34
35# ---------------------------------------------------------------------------
36# Paths
37# ---------------------------------------------------------------------------
38SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
39PROJECT_ROOT = os.path.realpath(os.path.join(SCRIPT_DIR, "..", ".."))
40BUILD_DIR = os.path.join(PROJECT_ROOT, "build")
41EULER_EXE = os.path.join(BUILD_DIR, "app", "euler.exe")
42BASE_CONFIG = os.path.join(PROJECT_ROOT, "cases",
43 "euler", "euler_config_IV.json")
44MESH_SMALL = os.path.join(PROJECT_ROOT, "data", "mesh",
45 "IV10_10.cgns") # 10x10 = 100 cells
46MESH_LARGE = os.path.join(PROJECT_ROOT, "data", "mesh",
47 "IV10_20.cgns") # 20x20 = 400 cells
48
49
51 """Remove // comments from JSON-with-comments (DNDS convention)."""
52 return re.sub(r"//.*", "", text)
53
54
def _load_json(path):
    """Load a JSON-with-comments file and return the parsed object.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so a config containing non-ASCII characters parses the
    same way on every machine.  ``//`` comments are removed with
    ``_strip_json_comments`` before the text is handed to ``json.loads``.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (a dict for the solver configs used here).
    """
    with open(path, encoding="utf-8") as f:
        return json.loads(_strip_json_comments(f.read()))
58
59
60def _write_json(path, obj):
61 with open(path, "w") as f:
62 json.dump(obj, f, indent=4)
63
64
def _run_solver(np_count, config_path, work_dir, overrides=None, timeout=300):
    """Run the Euler solver via mpirun and return the process result.

    Args:
        np_count: Number of MPI ranks, passed to ``mpirun -np``.
        config_path: Path of the solver configuration file.
        work_dir: Directory used as the working directory of the process.
        overrides: Optional iterable of ``(key, value)`` pairs; each pair is
            appended to the command line as ``-k key -v value``.  Keys and
            values are converted with ``str()`` so that numeric values can be
            passed directly.
        timeout: Seconds to wait for the process; ``subprocess.run`` raises
            ``subprocess.TimeoutExpired`` when it is exceeded.

    Returns:
        ``(result, stdout_text)`` where ``result`` is the
        ``subprocess.CompletedProcess`` and ``stdout_text`` is the combined
        stdout/stderr decoded as UTF-8 (undecodable bytes are replaced).
    """
    cmd = [
        "mpirun", "--oversubscribe", "-np", str(np_count),
        EULER_EXE, config_path,
    ]
    for key, value in overrides or ():
        # subprocess only accepts str/bytes/path arguments; coerce so that
        # e.g. an integer seed does not abort the launch with a TypeError.
        cmd.extend(["-k", str(key), "-v", str(value)])

    result = subprocess.run(
        cmd,
        cwd=work_dir,
        timeout=timeout,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr into the captured output
        env=os.environ.copy(),
    )
    stdout_text = result.stdout.decode("utf-8", errors="replace")
    if result.returncode != 0:
        # Only the tail is printed; the full text is returned to the caller.
        print(f"=== SOLVER FAILED (np={np_count}) ===")
        print(stdout_text[-4000:])
    return result, stdout_text
86
87
def _read_h5_u_data(h5_path):
    """Read the DOF 'u' data from a restart H5 file.

    Args:
        h5_path: Path of the restart H5 file.

    Returns:
        ``(origIndex, data)`` where:
          - origIndex: array of original cell indices read from
            ``/u/origIndex`` (the global key)
          - data: 2-D array of DOF values, shape (nGlobal, nVars), with
            nGlobal = len(origIndex)

    Raises:
        KeyError: if neither known data location exists under ``/u/father``.
        AssertionError: if origIndex is empty or the data size is not a
            multiple of the number of cells.
    """
    with h5py.File(h5_path, "r") as f:
        # The H5 layout from ArrayPair::WriteSerialize with origIndex:
        #   /u/origIndex          -- flat array of original cell indices
        #   /u/father/data        -- for plain ParArray
        #   /u/father/array/data  -- for ArrayEigenMatrix (extra 'array' sub-group)
        u_grp = f["u"]
        orig_idx = u_grp["origIndex"][:]

        father_grp = u_grp["father"]
        if "data" in father_grp:
            raw_data = father_grp["data"][:]
        elif "array" in father_grp and "data" in father_grp["array"]:
            raw_data = father_grp["array"]["data"][:]
        else:
            raise KeyError(
                f"Cannot find data in {h5_path}: /u/father/ contains {list(father_grp.keys())}")

    n_global = len(orig_idx)
    # Guard first: with zero cells the modulo below would raise a bare
    # ZeroDivisionError that says nothing about the file being read.
    assert n_global > 0, f"origIndex in {h5_path} is empty"
    assert raw_data.size % n_global == 0, (
        f"data size {raw_data.size} not divisible by nGlobal {n_global}"
    )
    n_vars = raw_data.size // n_global
    data = raw_data.reshape(n_global, n_vars)
    return orig_idx, data
118
119
120def _gather_by_orig_index(orig_idx, data):
121 """Reorder data rows by original index so that global ordering is canonical."""
122 order = np.argsort(orig_idx)
123 return orig_idx[order], data[order]
124
125
126# ---------------------------------------------------------------------------
127# The actual test
128# ---------------------------------------------------------------------------
129@pytest.fixture(scope="module")
131 """Create a temporary working directory for the test."""
132 d = tempfile.mkdtemp(prefix="dnds_test_redist_")
133 yield d
134 # Cleanup after test (comment out for debugging)
135 shutil.rmtree(d, ignore_errors=True)
136
137
def _make_step1_config(work_dir, mesh_file=MESH_SMALL):
    """Write the config for the initial 20-step run (produces the restart).

    Starts from BASE_CONFIG, applies the overrides below, creates
    ``<work_dir>/step1`` and saves the result as
    ``<work_dir>/step1_config.json``.

    Returns:
        ``(path, cfg)``: the written config path and the config dict.
    """
    cfg = _load_json(BASE_CONFIG)

    # Time march: 20 steps from scratch; tEnd is set far away so the step
    # count is what ends the run.  odeCode=0 with a small dt, as in the
    # original setup (noted there as ESDIRK4).
    cfg["timeMarchControl"].update({
        "nTimeStep": 20,
        "tEnd": 1e10,
        "dtImplicit": 1.25e-3,
        "odeCode": 0,
        "steadyQuit": False,
        "useRestart": False,
    })

    # Output: keep it minimal; the restart interval of 20 makes the restart
    # appear at the end of the 20-step run.
    cfg["outputControl"].update({
        "nDataOut": 1000000,
        "nDataOutC": 1000000,
        "nConsoleCheck": 5,
        "dataOutAtInit": False,
        "nRestartOut": 20,
        "nRestartOutC": 1000000,
    })

    # Mesh and output location
    io_ctrl = cfg["dataIOControl"]
    io_ctrl["meshFile"] = mesh_file
    step1_dir = os.path.join(work_dir, "step1")
    os.makedirs(step1_dir, exist_ok=True)
    io_ctrl["outPltName"] = os.path.join(step1_dir, "out")

    # H5 restart writer
    io_ctrl["restartWriter"] = {
        "type": "H5",
        "hdfDeflateLevel": 0,
        "hdfChunkSize": 0,
        "hdfCollOnMeta": True,
        "hdfCollOnData": False,
        "jsonBinaryDeflateLevel": 5,
        "jsonUseCodecOnUInt8": True,
    }

    # 1st order (maxOrder=1) for speed
    cfg["vfvSettings"].update({"maxOrder": 1, "intOrder": 1})

    # No bisection, no cell reordering
    io_ctrl["meshDirectBisect"] = 0
    io_ctrl["meshReorderCells"] = False

    config_path = os.path.join(work_dir, "step1_config.json")
    _write_json(config_path, cfg)
    return config_path, cfg
187
188
def _make_step2_config(work_dir, restart_h5_path, tag, np_count,
                       reorder_cells=False, mesh_bisect=0, mesh_file=MESH_SMALL):
    """Write the config for the 1-step restart run.

    Starts from BASE_CONFIG, points the restart state at *restart_h5_path*,
    creates ``<work_dir>/step2_<tag>`` and saves the result as
    ``<work_dir>/step2_<tag>_config.json``.

    Args:
        work_dir: Base directory for the config file and output directory.
        restart_h5_path: Restart file stored as ``lastRestartFile``.
        tag: Suffix that keeps the output of different runs apart.
        np_count: Not referenced by this function.
        reorder_cells: Value stored in ``meshReorderCells``.
        mesh_bisect: Value stored in ``meshDirectBisect``.
        mesh_file: Mesh path stored in ``meshFile``.

    Returns:
        ``(path, cfg)``: the written config path and the config dict.
    """
    cfg = _load_json(BASE_CONFIG)

    # Time march: a single step, started from the restart
    cfg["timeMarchControl"].update({
        "nTimeStep": 1,
        "tEnd": 1e10,
        "dtImplicit": 1.25e-3,
        "odeCode": 0,
        "steadyQuit": False,
        "useRestart": True,
    })

    # Restart state: continue from step 20
    cfg["restartState"] = {
        "iStep": 20,
        "iStepInternal": -1,
        "odeCodePrev": 0,
        "lastRestartFile": restart_h5_path,
        "otherRestartFile": "",
        "otherRestartStoreDim": [0],
    }

    # Output: restart written at init; step-based intervals pushed out of reach
    cfg["outputControl"].update({
        "nDataOut": 1000000,
        "nDataOutC": 1000000,
        "nConsoleCheck": 1,
        "dataOutAtInit": False,
        "restartOutAtInit": True,
        "nRestartOut": 1000000,
        "nRestartOutC": 1000000,
    })

    # Mesh and output location
    io_ctrl = cfg["dataIOControl"]
    io_ctrl["meshFile"] = mesh_file
    run_dir = os.path.join(work_dir, f"step2_{tag}")
    os.makedirs(run_dir, exist_ok=True)
    io_ctrl["outPltName"] = os.path.join(run_dir, "out")

    # H5 restart writer
    io_ctrl["restartWriter"] = {
        "type": "H5",
        "hdfDeflateLevel": 0,
        "hdfChunkSize": 0,
        "hdfCollOnMeta": True,
        "hdfCollOnData": False,
        "jsonBinaryDeflateLevel": 5,
        "jsonUseCodecOnUInt8": True,
    }

    cfg["vfvSettings"].update({"maxOrder": 1, "intOrder": 1})
    io_ctrl["meshDirectBisect"] = mesh_bisect
    io_ctrl["meshReorderCells"] = reorder_cells

    config_path = os.path.join(work_dir, f"step2_{tag}_config.json")
    _write_json(config_path, cfg)
    return config_path, cfg
246
247
248def _find_restart_h5(search_dir, label=""):
249 """Find restart H5 files in directory tree."""
250 restart_files = []
251 for root, dirs, files in os.walk(search_dir):
252 for f in files:
253 if f.endswith(".restart.dnds.h5"):
254 restart_files.append(os.path.join(root, f))
255 assert len(restart_files) >= 1, (
256 f"No restart H5 files found in {search_dir} ({label})"
257 )
258 return restart_files[0]
259
260
def _compare_restart_h5(restart_a, restart_b, tol=1e-10):
    """Compare the DOF data of two restart H5 files, matched by origIndex.

    Both files are read, their rows are sorted by original cell index, and
    the sorted arrays are compared element-wise.

    Args:
        restart_a: Path of the first (reference) restart file.
        restart_b: Path of the second restart file.
        tol: Strict upper bound on the maximum absolute difference.

    Returns:
        ``(max_diff, rel_norm)``: the maximum absolute difference, and the
        norm of the difference divided by the norm of the first data set.

    Raises:
        AssertionError: if the index sets differ, the data shapes differ, or
            the maximum absolute difference is not below *tol*.
    """
    orig_a, data_a = _read_h5_u_data(restart_a)
    orig_b, data_b = _read_h5_u_data(restart_b)

    orig_a_sorted, data_a_sorted = _gather_by_orig_index(orig_a, data_a)
    orig_b_sorted, data_b_sorted = _gather_by_orig_index(orig_b, data_b)

    assert np.array_equal(orig_a_sorted, orig_b_sorted), (
        "Original index sets differ between restarts"
    )
    # Without this check a (N, 1) array would broadcast against a (N, k) one
    # and the comparison below would run on values that do not correspond.
    assert data_a_sorted.shape == data_b_sorted.shape, (
        f"DOF data shapes differ: {data_a_sorted.shape} vs {data_b_sorted.shape}"
    )

    diff = data_a_sorted - data_b_sorted
    max_diff = np.max(np.abs(diff))
    # The tiny constant keeps the ratio defined when the reference is all zero.
    rel_norm = np.linalg.norm(diff) / (np.linalg.norm(data_a_sorted) + 1e-300)

    print(f" nGlobal cells: {len(orig_a_sorted)}")
    print(f" nVars per cell: {data_a_sorted.shape[1]}")
    print(f" Max abs diff: {max_diff:.6e}")
    print(f" Rel L2 diff: {rel_norm:.6e}")

    # A NaN difference compares False here, so it fails the check as well.
    assert max_diff < tol, (
        f"DOF data differs too much: max_diff={max_diff:.6e}, rel={rel_norm:.6e}"
    )
    return max_diff, rel_norm
286
287
def _run_step1(work_dir, np_write, mesh_file=MESH_SMALL):
    """Step 1: run the initial 20-step case and return its restart file.

    Writes the step-1 config, runs the solver on *np_write* ranks, asserts a
    zero exit code, and looks up the restart under ``<work_dir>/step1``.
    """
    config_path, _cfg = _make_step1_config(work_dir, mesh_file=mesh_file)
    proc, output = _run_solver(np_write, config_path, work_dir)
    assert proc.returncode == 0, f"Step 1 solver failed:\n{output[-2000:]}"

    step1_dir = os.path.join(work_dir, "step1")
    restart_h5 = _find_restart_h5(step1_dir, "step1")
    print(f" Restart file: {restart_h5}")
    return restart_h5
296
297
def _run_step2(work_dir, restart_h5, tag, np_count, overrides=None, mesh_file=MESH_SMALL):
    """Step 2: load *restart_h5* on *np_count* ranks and write it back.

    Writes the step-2 config for *tag*, runs the solver (forwarding
    *overrides*), asserts a zero exit code, and returns the restart found
    under ``<work_dir>/step2_<tag>``.
    """
    config_path, _cfg = _make_step2_config(
        work_dir, restart_h5, tag, np_count, mesh_file=mesh_file)
    proc, output = _run_solver(
        np_count, config_path, work_dir, overrides=overrides)
    assert proc.returncode == 0, f"Step 2 ({tag}) solver failed:\n{output[-2000:]}"

    run_dir = os.path.join(work_dir, f"step2_{tag}")
    restart_out = _find_restart_h5(run_dir, tag)
    print(f" Output restart: {restart_out}")
    return restart_out
308
309
310# ---------------------------------------------------------------------------
311# Test: same np, different Metis partition
312# ---------------------------------------------------------------------------
@pytest.fixture(scope="module")
def work_dir():
    """Module-scoped temporary directory shared by the small-mesh tests.

    The directory is created with the ``dnds_test_redist_`` prefix and is
    removed after the fixture is torn down; removal errors are ignored.
    """
    tmp_root = tempfile.mkdtemp(prefix="dnds_test_redist_")
    yield tmp_root
    shutil.rmtree(tmp_root, ignore_errors=True)
319
320
@pytest.fixture(scope="module")
def work_dir_large():
    """Module-scoped temporary directory for the large-mesh test.

    The directory is created with the ``dnds_test_redist_large_`` prefix and
    is removed after the fixture is torn down; removal errors are ignored.
    """
    tmp_root = tempfile.mkdtemp(prefix="dnds_test_redist_large_")
    yield tmp_root
    shutil.rmtree(tmp_root, ignore_errors=True)
327
328
330 """Write restart at np=2, read at np=2 with different Metis seed."""
331 if not os.path.isfile(EULER_EXE):
332 pytest.skip(f"euler.exe not found: {EULER_EXE}")
333 if not os.path.isfile(MESH_SMALL):
334 pytest.skip(f"Mesh file not found: {MESH_SMALL}")
335
336 np_write = 2
337
338 print(f"\n=== Step 1: 20-step run with np={np_write} ===")
339 restart_h5 = _run_step1(work_dir, np_write, mesh_file=MESH_SMALL)
340
341 print(f"\n=== Step 2a: load with np=2, same partition (reference) ===")
342 restart_a = _run_step2(work_dir, restart_h5, "ref",
343 2, mesh_file=MESH_SMALL)
344
345 print(f"\n=== Step 2b: load with np=2, different Metis seed ===")
346 restart_b = _run_step2(
347 work_dir, restart_h5, "reseed", 2,
348 overrides=[("/dataIOControl/meshPartitionOptions/metisSeed", "42")],
349 mesh_file=MESH_SMALL,
350 )
351
352 print("\n=== Comparing: same-np redistribution ===")
353 _compare_restart_h5(restart_a, restart_b)
354 print("=== PASS ===")
355
356
358 """Write restart at np=2, read at np=3 (cross-np redistribution)."""
359 if not os.path.isfile(EULER_EXE):
360 pytest.skip(f"euler.exe not found: {EULER_EXE}")
361
362 # Reuse step1 restart from the same work_dir (written by previous test)
363 restart_h5 = _find_restart_h5(
364 os.path.join(work_dir, "step1"), "step1 (reuse)")
365 print(f"\n Reusing Step 1 restart: {restart_h5}")
366
367 restart_ref = _find_restart_h5(os.path.join(
368 work_dir, "step2_ref"), "ref (reuse)")
369
370 print(f"\n=== Step 2c: load with np=3 (cross-np redistribution) ===")
371 restart_c = _run_step2(work_dir, restart_h5, "np3",
372 3, mesh_file=MESH_SMALL)
373
374 print("\n=== Comparing: cross-np redistribution ===")
375 _compare_restart_h5(restart_ref, restart_c)
376 print("=== PASS ===")
377
378
def test_restart_redistribute_large_mesh_multi_np(work_dir_large):
    """Write restart at np=4 with 20x20 mesh, read at np=4..8.

    A reference restart is produced by reloading at np=4.  Each further
    reload (np=4 with a different Metis seed, then np=5..8) must reproduce
    the reference DOF data within the tolerance of ``_compare_restart_h5``.
    The test is skipped when the solver executable or the mesh is missing.
    """
    if not os.path.isfile(EULER_EXE):
        pytest.skip(f"euler.exe not found: {EULER_EXE}")
    if not os.path.isfile(MESH_LARGE):
        pytest.skip(f"Mesh file not found: {MESH_LARGE}")

    np_write = 4

    print(
        f"\n=== Step 1: 20-step run with np={np_write}, mesh=IV10_20 (400 cells) ===")
    restart_h5 = _run_step1(work_dir_large, np_write, mesh_file=MESH_LARGE)

    print(f"\n=== Reference: load with np={np_write} ===")
    restart_ref = _run_step2(
        work_dir_large, restart_h5, "ref", np_write, mesh_file=MESH_LARGE)

    for np_read in range(4, 9):
        if np_read == np_write:
            # Same np: a different Metis seed forces a redistribution
            tag = f"np{np_read}_reseed"
            overrides = [
                ("/dataIOControl/meshPartitionOptions/metisSeed", "99")]
        else:
            tag = f"np{np_read}"
            overrides = None

        print(f"\n=== Load with np={np_read} ({tag}) ===")
        restart_out = _run_step2(
            work_dir_large, restart_h5, tag, np_read,
            overrides=overrides, mesh_file=MESH_LARGE,
        )

        print(f"=== Comparing np={np_write} vs np={np_read} ===")
        _compare_restart_h5(restart_ref, restart_out)
        print(f"=== PASS: np={np_read} ===")

    print("\n=== ALL np=4..8 PASS ===")
416
417
418if __name__ == "__main__":
419 with tempfile.TemporaryDirectory(prefix="dnds_test_redist_") as d:
422 with tempfile.TemporaryDirectory(prefix="dnds_test_redist_large_") as d:
_run_solver(np_count, config_path, work_dir, overrides=None, timeout=300)
test_restart_redistribute_large_mesh_multi_np(work_dir_large)
_run_step1(work_dir, np_write, mesh_file=MESH_SMALL)
_make_step2_config(work_dir, restart_h5_path, tag, np_count, reorder_cells=False, mesh_bisect=0, mesh_file=MESH_SMALL)
_make_step1_config(work_dir, mesh_file=MESH_SMALL)
_compare_restart_h5(restart_a, restart_b, tol=1e-10)
_run_step2(work_dir, restart_h5, tag, np_count, overrides=None, mesh_file=MESH_SMALL)