# test_restart_redistribute.py
# Recovered from the DNDSR 0.1.0.dev1+gcd065ad documentation listing
# (Distributed Numeric Data Structure for CFV).
1"""
2Functional test for ArrayPair redistribution via Euler solver restart.
3
4The test verifies that DOF data written in a restart file by one MPI partition
5can be correctly read by a different partition (different np), producing
6identical solver behaviour.
7
8Steps:
9 1. Run the Euler IV solver for 20 time steps with np=2, writing an H5 restart.
10 2. Load that restart and immediately write it back at np=2 (reference).
11 3. Load that restart and immediately write it back at np=3 (redistributed read).
12 4. Compare the two restart H5 files: the DOF data must match to machine precision.
13
14The file contains three test variants: same-np reseed, cross-np redistribution,
15and a large-mesh multi-np (np=4 → np=4..8) case.
16
17Usage:
18 pytest test/Euler/test_restart_redistribute.py -s
19 # or directly:
20 python test/Euler/test_restart_redistribute.py
21"""
22
23import json
24import os
25import re
26import shutil
27import subprocess
28import sys
29import tempfile
30
31import h5py
32import numpy as np
33import pytest
34
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# All paths are resolved relative to this file so the test can run from any CWD.
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
PROJECT_ROOT = os.path.realpath(os.path.join(SCRIPT_DIR, "..", ".."))  # test/Euler -> repo root
BUILD_DIR = os.path.join(PROJECT_ROOT, "build")
EULER_EXE = os.path.join(BUILD_DIR, "app", "euler.exe")  # solver binary (must be pre-built)
BASE_CONFIG = os.path.join(PROJECT_ROOT, "cases", "euler", "euler_config_IV.json")
DEFAULT_CONFIG = os.path.join(PROJECT_ROOT, "cases", "euler", "euler_default_config.json")
MESH_SMALL = os.path.join(PROJECT_ROOT, "data", "mesh", "IV10_10.cgns")  # 10x10 = 100 cells
MESH_LARGE = os.path.join(PROJECT_ROOT, "data", "mesh", "IV10_20.cgns")  # 20x20 = 400 cells
46
47
49 """Remove // comments from JSON-with-comments (DNDS convention)."""
50 return re.sub(r"//.*", "", text)
51
52
def _load_json(path):
    """Parse a DNDS JSON-with-comments file into a Python object."""
    with open(path) as fp:
        raw = fp.read()
    return json.loads(_strip_json_comments(raw))
56
57
58def _write_json(path, obj):
59 with open(path, "w") as f:
60 json.dump(obj, f, indent=4)
61
62
def _run_solver(np_count, config_path, work_dir, overrides=None, timeout=300):
    """Run the Euler solver via mpirun and return (CompletedProcess, stdout text).

    overrides: optional iterable of (key, value) pairs forwarded to the solver
    as ``-k key -v value`` command-line pairs.
    """
    command = [
        "mpirun", "--oversubscribe", "-np", str(np_count),
        EULER_EXE, config_path,
    ]
    for key, value in (overrides or []):
        command += ["-k", key, "-v", value]

    # stderr is folded into stdout so failures produce one coherent log.
    proc = subprocess.run(
        command,
        cwd=work_dir,
        timeout=timeout,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        env=os.environ.copy(),
    )
    text = proc.stdout.decode("utf-8", errors="replace")
    if proc.returncode != 0:
        # Dump the log tail so CI output shows why the solver died.
        print(f"=== SOLVER FAILED (np={np_count}) ===")
        print(text[-4000:])
    return proc, text
84
85
def _read_h5_u_data(h5_path):
    """Read the DOF 'u' data from a restart H5 file.

    Returns (origIndex, data) where:
      - origIndex: 1-D array of original cell indices (global key)
      - data: 2-D array of DOF values, shape (nGlobal, nVars)
    """
    with h5py.File(h5_path, "r") as h5:
        # Layout produced by ArrayPair::WriteSerialize with origIndex:
        #   /u/origIndex         -- flat array of original cell indices
        #   /u/father/data       -- plain ParArray
        #   /u/father/array/data -- ArrayEigenMatrix (extra 'array' sub-group)
        u_grp = h5["u"]
        orig_idx = u_grp["origIndex"][:]
        father_grp = u_grp["father"]

        if "data" in father_grp:
            raw_data = father_grp["data"][:]
        elif "array" in father_grp and "data" in father_grp["array"]:
            raw_data = father_grp["array"]["data"][:]
        else:
            raise KeyError(f"Cannot find data in {h5_path}: /u/father/ contains {list(father_grp.keys())}")

    # nVars is recovered from the flat size; it must divide evenly.
    n_global = len(orig_idx)
    assert raw_data.size % n_global == 0, (
        f"data size {raw_data.size} not divisible by nGlobal {n_global}"
    )
    n_vars = raw_data.size // n_global
    return orig_idx, raw_data.reshape(n_global, n_vars)
115
116
117def _gather_by_orig_index(orig_idx, data):
118 """Reorder data rows by original index so that global ordering is canonical."""
119 order = np.argsort(orig_idx)
120 return orig_idx[order], data[order]
121
122
123# ---------------------------------------------------------------------------
124# The actual test
125# ---------------------------------------------------------------------------
@pytest.fixture(scope="module")
def work_dir():
    """Create a temporary working directory for the test."""
    # Fix: the `def` line was lost in the documentation extraction.
    # NOTE(review): an identical `work_dir` fixture is defined again further
    # down this file and shadows this one -- consider deleting one of them.
    d = tempfile.mkdtemp(prefix="dnds_test_redist_")
    yield d
    # Cleanup after test (comment out for debugging)
    shutil.rmtree(d, ignore_errors=True)
133
134
def _make_step1_config(work_dir, mesh_file=MESH_SMALL):
    """Config for the initial 20-step run (produces the restart).

    Parameters:
        work_dir: directory receiving the generated config and the step1/ output tree.
        mesh_file: CGNS mesh path (defaults to the small 10x10 mesh).

    Returns:
        (path, cfg): path of the written JSON config file and the config dict.
    """
    cfg = _load_json(BASE_CONFIG)

    # Time march: 20 steps, explicit ESDIRK4 (odeCode=0), small dt
    cfg["timeMarchControl"]["nTimeStep"] = 20
    cfg["timeMarchControl"]["tEnd"] = 1e10  # won't hit tEnd, use nTimeStep
    cfg["timeMarchControl"]["dtImplicit"] = 1.25e-3
    cfg["timeMarchControl"]["odeCode"] = 0
    cfg["timeMarchControl"]["steadyQuit"] = False
    cfg["timeMarchControl"]["useRestart"] = False

    # Output: minimal -- huge intervals effectively disable plot output
    cfg["outputControl"]["nDataOut"] = 1000000
    cfg["outputControl"]["nDataOutC"] = 1000000
    cfg["outputControl"]["nConsoleCheck"] = 5
    cfg["outputControl"]["dataOutAtInit"] = False
    # Restart output every 20 steps (i.e. at the end)
    cfg["outputControl"]["nRestartOut"] = 20
    cfg["outputControl"]["nRestartOutC"] = 1000000

    # Mesh
    cfg["dataIOControl"]["meshFile"] = mesh_file
    out_base = os.path.join(work_dir, "step1", "out")
    os.makedirs(os.path.dirname(out_base), exist_ok=True)
    cfg["dataIOControl"]["outPltName"] = out_base

    # H5 restart writer
    cfg["dataIOControl"]["restartWriter"] = {
        "type": "H5",
        "hdfDeflateLevel": 0,
        "hdfChunkSize": 0,
        "hdfCollOnMeta": True,
        "hdfCollOnData": False,
        "jsonBinaryDeflateLevel": 5,
        "jsonUseCodecOnUInt8": True,
    }

    # Use 1st order for speed (maxOrder=1)
    cfg["vfvSettings"]["maxOrder"] = 1
    cfg["vfvSettings"]["intOrder"] = 1

    # No bisection
    cfg["dataIOControl"]["meshDirectBisect"] = 0
    cfg["dataIOControl"]["meshReorderCells"] = False

    path = os.path.join(work_dir, "step1_config.json")
    _write_json(path, cfg)
    return path, cfg
184
185
def _make_step2_config(work_dir, restart_h5_path, tag, np_count,
                       reorder_cells=False, mesh_bisect=0, mesh_file=MESH_SMALL):
    """Config for the 1-step restart run.

    Parameters:
        work_dir: directory receiving the generated config and the step2_<tag>/ output tree.
        restart_h5_path: H5 restart file loaded via restartState.lastRestartFile.
        tag: label embedded in the output directory and config file names.
        np_count: intended MPI rank count.
            NOTE(review): not used in the config body -- the rank count is
            applied by _run_solver; kept for signature symmetry with callers.
        reorder_cells: forwarded to dataIOControl.meshReorderCells.
        mesh_bisect: forwarded to dataIOControl.meshDirectBisect.
        mesh_file: CGNS mesh path (defaults to the small 10x10 mesh).

    Returns:
        (path, cfg): path of the written JSON config file and the config dict.
    """
    cfg = _load_json(BASE_CONFIG)

    # Time march: 0 steps from restart -- just load and write
    cfg["timeMarchControl"]["nTimeStep"] = 0
    cfg["timeMarchControl"]["tEnd"] = 1e10
    cfg["timeMarchControl"]["dtImplicit"] = 1.25e-3
    cfg["timeMarchControl"]["odeCode"] = 0
    cfg["timeMarchControl"]["steadyQuit"] = False
    cfg["timeMarchControl"]["useRestart"] = True

    # Restart state: start from step 20
    cfg["restartState"] = {
        "iStep": 20,
        "iStepInternal": -1,
        "odeCodePrev": 0,
        "lastRestartFile": restart_h5_path,
        "otherRestartFile": "",
        "otherRestartStoreDim": [0],
    }

    # Output: write restart immediately at init, no step-based output
    cfg["outputControl"]["nDataOut"] = 1000000
    cfg["outputControl"]["nDataOutC"] = 1000000
    cfg["outputControl"]["nConsoleCheck"] = 1
    cfg["outputControl"]["dataOutAtInit"] = False
    cfg["outputControl"]["restartOutAtInit"] = True
    cfg["outputControl"]["nRestartOut"] = 1000000
    cfg["outputControl"]["nRestartOutC"] = 1000000

    # Mesh
    cfg["dataIOControl"]["meshFile"] = mesh_file
    out_base = os.path.join(work_dir, f"step2_{tag}", "out")
    os.makedirs(os.path.dirname(out_base), exist_ok=True)
    cfg["dataIOControl"]["outPltName"] = out_base

    # H5 restart writer (same settings as step 1)
    cfg["dataIOControl"]["restartWriter"] = {
        "type": "H5",
        "hdfDeflateLevel": 0,
        "hdfChunkSize": 0,
        "hdfCollOnMeta": True,
        "hdfCollOnData": False,
        "jsonBinaryDeflateLevel": 5,
        "jsonUseCodecOnUInt8": True,
    }

    cfg["vfvSettings"]["maxOrder"] = 1
    cfg["vfvSettings"]["intOrder"] = 1
    cfg["dataIOControl"]["meshDirectBisect"] = mesh_bisect
    cfg["dataIOControl"]["meshReorderCells"] = reorder_cells

    path = os.path.join(work_dir, f"step2_{tag}_config.json")
    _write_json(path, cfg)
    return path, cfg
243
244
245def _find_restart_h5(search_dir, label=""):
246 """Find restart H5 files in directory tree."""
247 restart_files = []
248 for root, dirs, files in os.walk(search_dir):
249 for f in files:
250 if f.endswith(".restart.dnds.h5"):
251 restart_files.append(os.path.join(root, f))
252 assert len(restart_files) >= 1, (
253 f"No restart H5 files found in {search_dir} ({label})"
254 )
255 return restart_files[0]
256
257
def _compare_restart_h5(restart_a, restart_b, tol=1e-10):
    """Compare two restart H5 files by origIndex. Returns max_diff, rel_norm."""
    # Read both files and bring them into the canonical global ordering.
    idx_a, u_a = _gather_by_orig_index(*_read_h5_u_data(restart_a))
    idx_b, u_b = _gather_by_orig_index(*_read_h5_u_data(restart_b))

    assert np.array_equal(idx_a, idx_b), (
        "Original index sets differ between restarts"
    )

    delta = u_a - u_b
    max_diff = np.max(np.abs(delta))
    # Tiny denominator guard avoids division by zero on an all-zero field.
    rel_norm = np.linalg.norm(delta) / (np.linalg.norm(u_a) + 1e-300)

    print(f"  nGlobal cells: {len(idx_a)}")
    print(f"  nVars per cell: {u_a.shape[1]}")
    print(f"  Max abs diff: {max_diff:.6e}")
    print(f"  Rel L2 diff: {rel_norm:.6e}")

    assert max_diff < tol, (
        f"DOF data differs too much: max_diff={max_diff:.6e}, rel={rel_norm:.6e}"
    )
    return max_diff, rel_norm
282
283
def _run_step1(work_dir, np_write, mesh_file=MESH_SMALL):
    """Run step 1: initial 20-step run producing a restart; return its H5 path."""
    config_path, _cfg = _make_step1_config(work_dir, mesh_file=mesh_file)
    # Place the default config in the run directory alongside the case config.
    shutil.copy(DEFAULT_CONFIG, os.path.join(work_dir, "euler_default_config.json"))
    proc, log = _run_solver(np_write, config_path, work_dir)
    assert proc.returncode == 0, f"Step 1 solver failed:\n{log[-2000:]}"
    restart_h5 = _find_restart_h5(os.path.join(work_dir, "step1"), "step1")
    print(f"  Restart file: {restart_h5}")
    return restart_h5
293
294
def _run_step2(work_dir, restart_h5, tag, np_count, overrides=None, mesh_file=MESH_SMALL):
    """Run step 2: load restart and immediately write it back; return the new H5 path."""
    config_path, _cfg = _make_step2_config(work_dir, restart_h5, tag, np_count, mesh_file=mesh_file)
    proc, log = _run_solver(np_count, config_path, work_dir, overrides=overrides)
    assert proc.returncode == 0, f"Step 2 ({tag}) solver failed:\n{log[-2000:]}"
    out_dir = os.path.join(work_dir, f"step2_{tag}")
    restart_out = _find_restart_h5(out_dir, tag)
    print(f"  Output restart: {restart_out}")
    return restart_out
303
304
305# ---------------------------------------------------------------------------
306# Test: same np, different Metis partition
307# ---------------------------------------------------------------------------
@pytest.fixture(scope="module")
def work_dir():
    """Module-scoped scratch directory, removed at fixture teardown."""
    tmp = tempfile.mkdtemp(prefix="dnds_test_redist_")
    yield tmp
    shutil.rmtree(tmp, ignore_errors=True)
314
315
@pytest.fixture(scope="module")
def work_dir_large():
    """Create a temporary working directory for the large-mesh test."""
    # Fix: the `def` line was lost in the documentation extraction; the name
    # is recovered from the large-mesh test's fixture parameter.
    d = tempfile.mkdtemp(prefix="dnds_test_redist_large_")
    yield d
    shutil.rmtree(d, ignore_errors=True)
322
323
def test_restart_redistribute_same_np(work_dir):
    """Write restart at np=2, read at np=2 with different Metis seed."""
    # Fix: the `def` line was lost in the documentation extraction; the name
    # follows the file's test_restart_redistribute_* pattern.
    # NOTE: the cross-np test below reuses this work_dir's step1/ and
    # step2_ref/ outputs, so it must run after this test in the same module.
    assert os.path.isfile(EULER_EXE), f"euler.exe not found: {EULER_EXE}"
    assert os.path.isfile(MESH_SMALL), f"Mesh file not found: {MESH_SMALL}"

    np_write = 2

    print(f"\n=== Step 1: 20-step run with np={np_write} ===")
    restart_h5 = _run_step1(work_dir, np_write, mesh_file=MESH_SMALL)

    print(f"\n=== Step 2a: load with np=2, same partition (reference) ===")
    restart_a = _run_step2(work_dir, restart_h5, "ref", 2, mesh_file=MESH_SMALL)

    print(f"\n=== Step 2b: load with np=2, different Metis seed ===")
    restart_b = _run_step2(
        work_dir, restart_h5, "reseed", 2,
        overrides=[("/dataIOControl/meshPartitionOptions/metisSeed", "42")],
        mesh_file=MESH_SMALL,
    )

    print("\n=== Comparing: same-np redistribution ===")
    _compare_restart_h5(restart_a, restart_b)
    print("=== PASS ===")
347
348
def test_restart_redistribute_cross_np(work_dir):
    """Write restart at np=2, read at np=3 (cross-np redistribution)."""
    # Fix: the `def` line was lost in the documentation extraction; the name
    # follows the file's test_restart_redistribute_* pattern.
    assert os.path.isfile(EULER_EXE), f"euler.exe not found: {EULER_EXE}"

    # Reuse step1 restart from the same work_dir (written by previous test)
    restart_h5 = _find_restart_h5(os.path.join(work_dir, "step1"), "step1 (reuse)")
    print(f"\n  Reusing Step 1 restart: {restart_h5}")

    restart_ref = _find_restart_h5(os.path.join(work_dir, "step2_ref"), "ref (reuse)")

    print(f"\n=== Step 2c: load with np=3 (cross-np redistribution) ===")
    restart_c = _run_step2(work_dir, restart_h5, "np3", 3, mesh_file=MESH_SMALL)

    print("\n=== Comparing: cross-np redistribution ===")
    _compare_restart_h5(restart_ref, restart_c)
    print("=== PASS ===")
365
366
def test_restart_redistribute_large_mesh_multi_np(work_dir_large):
    """Write restart at np=4 with 20x20 mesh, read at np=4..8."""
    # Fix: the `def` line was lost in the documentation extraction; the name
    # is confirmed by the __main__ driver at the bottom of the file.
    assert os.path.isfile(EULER_EXE), f"euler.exe not found: {EULER_EXE}"
    assert os.path.isfile(MESH_LARGE), f"Mesh file not found: {MESH_LARGE}"

    np_write = 4

    print(f"\n=== Step 1: 20-step run with np={np_write}, mesh=IV10_20 (400 cells) ===")
    restart_h5 = _run_step1(work_dir_large, np_write, mesh_file=MESH_LARGE)

    print(f"\n=== Reference: load with np={np_write} ===")
    restart_ref = _run_step2(work_dir_large, restart_h5, "ref", np_write, mesh_file=MESH_LARGE)

    for np_read in [4, 5, 6, 7, 8]:
        tag = f"np{np_read}"
        overrides = None
        if np_read == np_write:
            # Same np: use different Metis seed to force redistribution
            overrides = [("/dataIOControl/meshPartitionOptions/metisSeed", "99")]
            tag = f"np{np_read}_reseed"

        print(f"\n=== Load with np={np_read} ({tag}) ===")
        restart_out = _run_step2(
            work_dir_large, restart_h5, tag, np_read,
            overrides=overrides, mesh_file=MESH_LARGE,
        )

        print(f"=== Comparing np={np_write} vs np={np_read} ===")
        _compare_restart_h5(restart_ref, restart_out)
        print(f"=== PASS: np={np_read} ===")

    print("\n=== ALL np=4..8 PASS ===")
399
400
if __name__ == "__main__":
    # Standalone driver (no pytest): run the tests with throwaway directories.
    # Reconstructed: the original call lines were replaced by documentation
    # cross-reference footer text during extraction.
    with tempfile.TemporaryDirectory(prefix="dnds_test_redist_") as d:
        # Shared work dir: the cross-np test reuses the same-np test's outputs.
        test_restart_redistribute_same_np(d)
        test_restart_redistribute_cross_np(d)
    with tempfile.TemporaryDirectory(prefix="dnds_test_redist_large_") as d:
        test_restart_redistribute_large_mesh_multi_np(d)