% Workshop paper in the Euro-Par 2020 workshops volume (Springer LNCS).
% Cleaned from a repository export: names are in "Last, First" form, export
% boilerplate was removed from the series, and the former free-text note was
% split into the copyright / eventtitle / eventdate fields so that it is no
% longer printed verbatim by classic BibTeX styles. The citation key is kept
% unchanged so existing citations still resolve.
% NOTE(review): the editor surname "B. Heras" is kept as exported; confirm
% whether it should instead be entered as "Heras, Dora B.".
@inproceedings{4c8acc423850463083ec6a518829e617,
  author     = {P{\'e}rez, Diego and Ropars, Thomas and Meneses, Esteban},
  title      = {On the Detection of Silent Data Corruptions in {HPC} Applications Using Redundant Multi-threading},
  booktitle  = {{Euro-Par} 2020: Parallel Processing Workshops},
  editor     = {Balis, Bartosz and {B. Heras}, Dora and Antonelli, Laura and Bracciali, Andrea and Gruber, Thomas and Jin, Hyun-Wook and Kuhn, Michael and Scott, Stephen L. and Unat, Didem and Wyrzykowski, Roman},
  series     = {Lecture Notes in Computer Science},
  publisher  = {Springer Science and Business Media Deutschland GmbH},
  year       = {2021},
  pages      = {290--302},
  doi        = {10.1007/978-3-030-71593-9_23},
  isbn       = {9783030715922},
  language   = {english},
  copyright  = {{\textcopyright} 2021, Springer Nature Switzerland AG.},
  eventtitle = {Workshops held at the 26th International Conference on Parallel and Distributed Computing, {Euro-Par} 2020},
  eventdate  = {2020-08-24/2020-08-25},
  keywords   = {HPC, Redundant multi-threading, Silent data corruptions},
  abstract   = {This paper studies the use of Redundant Multi-Threading (RMT) to detect Silent Data Corruptions in HPC applications. To understand if it can be a viable solution in an HPC context, we study two software optimizations to reduce RMT performance overhead by reducing the amount of data exchanged between the replicated threads. We conduct experiments with representative HPC workloads to measure the performance gains obtained through these optimizations, and the error detection coverage they achieve. In the best case, when running on a processor that features Simultaneous Multi-Threading, our results show that the overhead can be as low as 1.4$\times$ without significantly reducing the ability to detect data corruptions.},
}