CINXE.COM

Kapil Arya - Academia.edu

<!DOCTYPE html>
<html lang="en" xmlns:fb="http://www.facebook.com/2008/fbml" class="wf-loading">
<head prefix="og: https://ogp.me/ns# fb: https://ogp.me/ns/fb# academia: https://ogp.me/ns/fb/academia#">
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- OpenSearch autodiscovery has to be a <link>: <meta> has no rel/href attributes. -->
<link rel="search" type="application/opensearchdescription+xml" href="/open_search.xml" title="Academia.edu">
<title>Kapil Arya - Academia.edu</title>
<!-- _ _ _ | | (_) | | __ _ ___ __ _ __| | ___ _ __ ___ _ __ _ ___ __| |_ _ / _` |/ __/ _` |/ _` |/ _ \ '_ ` _ \| |/ _` | / _ \/ _` | | | | | (_| | (_| (_| | (_| | __/ | | | | | | (_| || __/ (_| | |_| | \__,_|\___\__,_|\__,_|\___|_| |_| |_|_|\__,_(_)___|\__,_|\__,_| We're hiring! See https://www.academia.edu/hiring -->

<!-- Favicons and touch icons -->
<link href="//a.academia-assets.com/images/favicons/favicon-production.ico" rel="shortcut icon" type="image/vnd.microsoft.icon">
<link rel="apple-touch-icon" sizes="57x57" href="//a.academia-assets.com/images/favicons/apple-touch-icon-57x57.png">
<link rel="apple-touch-icon" sizes="60x60" href="//a.academia-assets.com/images/favicons/apple-touch-icon-60x60.png">
<link rel="apple-touch-icon" sizes="72x72" href="//a.academia-assets.com/images/favicons/apple-touch-icon-72x72.png">
<link rel="apple-touch-icon" sizes="76x76" href="//a.academia-assets.com/images/favicons/apple-touch-icon-76x76.png">
<link rel="apple-touch-icon" sizes="114x114" href="//a.academia-assets.com/images/favicons/apple-touch-icon-114x114.png">
<link rel="apple-touch-icon" sizes="120x120" href="//a.academia-assets.com/images/favicons/apple-touch-icon-120x120.png">
<link rel="apple-touch-icon" sizes="144x144" href="//a.academia-assets.com/images/favicons/apple-touch-icon-144x144.png">
<link rel="apple-touch-icon" sizes="152x152" href="//a.academia-assets.com/images/favicons/apple-touch-icon-152x152.png">
<link rel="apple-touch-icon" sizes="180x180" href="//a.academia-assets.com/images/favicons/apple-touch-icon-180x180.png">
<link rel="icon" type="image/png" href="//a.academia-assets.com/images/favicons/favicon-32x32.png" sizes="32x32">
<link rel="icon" type="image/png" href="//a.academia-assets.com/images/favicons/favicon-194x194.png" sizes="194x194">
<link rel="icon" type="image/png" href="//a.academia-assets.com/images/favicons/favicon-96x96.png" sizes="96x96">
<link rel="icon" type="image/png" href="//a.academia-assets.com/images/favicons/android-chrome-192x192.png" sizes="192x192">
<link rel="icon" type="image/png" href="//a.academia-assets.com/images/favicons/favicon-16x16.png" sizes="16x16">
<link rel="manifest" href="//a.academia-assets.com/images/favicons/manifest.json">
<meta name="msapplication-TileColor" content="#2b5797">
<meta name="msapplication-TileImage" content="//a.academia-assets.com/images/favicons/mstile-144x144.png">
<meta name="theme-color" content="#ffffff">

<script>
  // Record a "Time To First Byte" measure between the requestStart and
  // responseStart navigation marks, when the Performance API is available.
  window.performance && window.performance.measure && window.performance.measure("Time To First Byte", "requestStart", "responseStart");
</script>
<script>
  // Remove the listed tracking parameters from the address bar without
  // reloading the page. Does nothing when the required browser APIs are
  // missing or when none of the parameters is present in the query string.
  //
  // NOTE: the `//` comments below must each end at a line break; if this
  // script is collapsed onto one line the first comment swallows the rest
  // of the code and the script fails to parse.
  (function() {
    if (!window.URLSearchParams || !window.history || !window.history.replaceState) {
      return;
    }

    var searchParams = new URLSearchParams(window.location.search);
    var paramsToDelete = [
      'fs',
      'sm',
      'swp',
      'iid',
      'nbs',
      'rcc', // related content category
      'rcpos', // related content carousel position
      'rcpg', // related carousel page
      'rchid', // related content hit id
      'f_ri', // research interest id, for SEO tracking
      'f_fri', // featured research interest, for SEO tracking (param key without value)
      'f_rid', // from research interest directory for SEO tracking
      'f_loswp', // from research interest pills on LOSWP sidebar for SEO tracking
      'rhid', // referring hit id
    ];

    // Nothing to clean: leave the history entry untouched.
    if (paramsToDelete.every((key) => searchParams.get(key) === null)) {
      return;
    }

    paramsToDelete.forEach((key) => {
      searchParams.delete(key);
    });

    // Rebuild the current URL with the remaining parameters and swap it in
    // place of the current history entry.
    var cleanUrl = new URL(window.location.href);
    cleanUrl.search = searchParams.toString();
    history.replaceState({}, document.title, cleanUrl);
  })();
</script>
<!-- Google tag (gtag.js) loader -->
<script async src="https://www.googletagmanager.com/gtag/js?id=G-5VKX33P2DS"></script>
<script>
  // Set up the dataLayer queue and the gtag() helper, configure the property
  // with automatic page views disabled, then send one explicit page_view
  // event carrying the controller/action context for this page.
  //
  // NOTE: the `//` comments inside the event payload must each end at a line
  // break; on a single collapsed line they comment out the closing `})` and
  // the whole script (including the gtag definition) fails to parse.
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  gtag('config', 'G-5VKX33P2DS', {
    cookie_domain: 'academia.edu',
    send_page_view: false,
  });
  gtag('event', 'page_view', {
    'controller': "profiles/works",
    'action': "summary",
    'controller_action': 'profiles/works#summary',
    'logged_in': 'false',
    'edge': 'unknown',
    // Send nil if there is no A/B test bucket, in case some records get logged
    // with missing data - that way we can distinguish between the two cases.
    // ab_test_bucket should be of the form <ab_test_name>:<bucket>
    'ab_test_bucket': null,
  })
</script>
<script>
  // Report a previously recorded performance measure to analytics as a
  // 'timing_complete' event. Skipped when the Performance API is missing or
  // when there is not exactly one "measure" entry with the given name.
  window.sendUserTiming = function(timingName) {
    if (!(window.performance && window.performance.measure)) return;
    var entries = window.performance.getEntriesByName(timingName, "measure");
    if (entries.length !== 1) return;
    // Duration is rounded to a whole number before being sent.
    var timingValue = Math.round(entries[0].duration);
    gtag('event', 'timing_complete', {
      name: timingName,
      value: timingValue,
      event_category: 'User-centric',
    });
  };
  window.sendUserTiming("Time To First Byte");
</script>
<meta name="csrf-param" content="authenticity_token">
<meta name="csrf-token" content="YXyLwqf0Hbf22at5_7pm6uCDyihJsncC2zjJzfdETlQ5hIxmOMVoAwf9-UtpDc0mICvZFeUSae3JCkJMZcd8_A">
<link rel="stylesheet" href="//a.academia-assets.com/assets/wow-3d36c19b4875b226bfed0fcba1dcea3f2fe61148383d97c0465c016b8c969290.css" media="all">
<link rel="stylesheet" href="//a.academia-assets.com/assets/social/home-79e78ce59bef0a338eb6540ec3d93b4a7952115b56c57f1760943128f4544d42.css" media="all">
<link rel="stylesheet" href="//a.academia-assets.com/assets/single_work_page/figure_carousel-2004283e0948681916eefa74772df54f56cb5c7413d82b160212231c2f474bb3.css" media="all">
<!-- The <script> start tag opened here is completed by the attributes on the next line. -->
<script
type="application/ld+json">{"@context":"https://schema.org","@type":"ProfilePage","mainEntity":{"@context":"https://schema.org","@type":"Person","name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2","image":"https://0.academia-photos.com/46423/83167/91049/s200_kapil.arya.jpg","sameAs":[]},"dateCreated":"2009-06-02T23:06:29-07:00","dateModified":"2024-02-20T11:29:23-08:00","name":"Kapil Arya","description":"","image":"https://0.academia-photos.com/46423/83167/91049/s200_kapil.arya.jpg","thumbnailUrl":"https://0.academia-photos.com/46423/83167/91049/s65_kapil.arya.jpg","primaryImageOfPage":{"@type":"ImageObject","url":"https://0.academia-photos.com/46423/83167/91049/s200_kapil.arya.jpg","width":200},"sameAs":[],"relatedLink":"https://www.academia.edu/117958233/Transparent_Checkpoint_Restart_over_InfiniBand"}</script><link rel="stylesheet" href="//a.academia-assets.com/assets/design_system/heading-95367dc03b794f6737f30123738a886cf53b7a65cdef98a922a98591d60063e3.css" media="all" /><link rel="stylesheet" href="//a.academia-assets.com/assets/design_system/button-8c9ae4b5c8a2531640c354d92a1f3579c8ff103277ef74913e34c8a76d4e6c00.css" media="all" /><link rel="stylesheet" href="//a.academia-assets.com/assets/design_system/body-170d1319f0e354621e81ca17054bb147da2856ec0702fe440a99af314a6338c5.css" media="all" /><link rel="stylesheet" href="//a.academia-assets.com/assets/single_work_page/figure_carousel-2004283e0948681916eefa74772df54f56cb5c7413d82b160212231c2f474bb3.css" media="all" /><style type="text/css">@media(max-width: 567px){:root{--token-mode: Parity;--dropshadow: 0 2px 4px 0 #22223340;--primary-brand: #0645b1;--error-dark: #b60000;--success-dark: #05b01c;--inactive-fill: #ebebee;--hover: #0c3b8d;--pressed: #082f75;--button-primary-fill-inactive: #ebebee;--button-primary-fill: #0645b1;--button-primary-text: #ffffff;--button-primary-fill-hover: #0c3b8d;--button-primary-fill-press: #082f75;--button-primary-icon: #ffffff;--button-primary-fill-inverse: 
#ffffff;--button-primary-text-inverse: #082f75;--button-primary-icon-inverse: #0645b1;--button-primary-fill-inverse-hover: #cddaef;--button-primary-stroke-inverse-pressed: #0645b1;--button-secondary-stroke-inactive: #b1b1ba;--button-secondary-fill: #eef2f9;--button-secondary-text: #082f75;--button-secondary-fill-press: #cddaef;--button-secondary-fill-inactive: #ebebee;--button-secondary-stroke: #cddaef;--button-secondary-stroke-hover: #386ac1;--button-secondary-stroke-press: #0645b1;--button-secondary-text-inactive: #b1b1ba;--button-secondary-icon: #082f75;--button-secondary-fill-hover: #e6ecf7;--button-secondary-stroke-inverse: #ffffff;--button-secondary-fill-inverse: rgba(255, 255, 255, 0);--button-secondary-icon-inverse: #ffffff;--button-secondary-icon-hover: #082f75;--button-secondary-icon-press: #082f75;--button-secondary-text-inverse: #ffffff;--button-secondary-text-hover: #082f75;--button-secondary-text-press: #082f75;--button-secondary-fill-inverse-hover: #043059;--button-xs-stroke: #141413;--button-xs-stroke-hover: #0c3b8d;--button-xs-stroke-press: #082f75;--button-xs-stroke-inactive: #ebebee;--button-xs-text: #141413;--button-xs-text-hover: #0c3b8d;--button-xs-text-press: #082f75;--button-xs-text-inactive: #91919e;--button-xs-icon: #141413;--button-xs-icon-hover: #0c3b8d;--button-xs-icon-press: #082f75;--button-xs-icon-inactive: #91919e;--button-xs-fill: #ffffff;--button-xs-fill-hover: #f4f7fc;--button-xs-fill-press: #eef2f9;--buttons-button-text-inactive: #91919e;--buttons-button-focus: #0645b1;--buttons-button-icon-inactive: #91919e;--buttons-small-buttons-corner-radius: 8px;--buttons-small-buttons-l-r-padding: 12px;--buttons-small-buttons-height: 44px;--buttons-small-buttons-gap: 8px;--buttons-small-buttons-icon-only-width: 44px;--buttons-small-buttons-icon-size: 20px;--buttons-small-buttons-stroke-default: 1px;--buttons-small-buttons-stroke-thick: 2px;--buttons-large-buttons-l-r-padding: 20px;--buttons-large-buttons-height: 
54px;--buttons-large-buttons-icon-only-width: 54px;--buttons-large-buttons-icon-size: 20px;--buttons-large-buttons-gap: 8px;--buttons-large-buttons-corner-radius: 8px;--buttons-large-buttons-stroke-default: 1px;--buttons-large-buttons-stroke-thick: 2px;--buttons-extra-small-buttons-l-r-padding: 8px;--buttons-extra-small-buttons-height: 32px;--buttons-extra-small-buttons-icon-size: 16px;--buttons-extra-small-buttons-gap: 4px;--buttons-extra-small-buttons-corner-radius: 8px;--buttons-stroke-default: 1px;--buttons-stroke-thick: 2px;--background-beige: #f9f7f4;--error-light: #fff2f2;--text-placeholder: #6d6d7d;--stroke-dark: #141413;--stroke-light: #dddde2;--stroke-medium: #535366;--accent-green: #ccffd4;--accent-turquoise: #ccf7ff;--accent-yellow: #f7ffcc;--accent-peach: #ffd4cc;--accent-violet: #f7ccff;--accent-purple: #f4f7fc;--text-primary: #141413;--secondary-brand: #141413;--text-hover: #0c3b8d;--text-white: #ffffff;--text-link: #0645b1;--text-press: #082f75;--success-light: #f0f8f1;--background-light-blue: #eef2f9;--background-white: #ffffff;--premium-dark: #877440;--premium-light: #f9f6ed;--stroke-white: #ffffff;--inactive-content: #b1b1ba;--annotate-light: #a35dff;--annotate-dark: #824acc;--grid: #eef2f9;--inactive-stroke: #ebebee;--shadow: rgba(34, 34, 51, 0.25);--text-inactive: #6d6d7d;--text-error: #b60000;--stroke-error: #b60000;--background-error: #fff2f2;--background-black: #141413;--icon-default: #141413;--icon-blue: #0645b1;--background-grey: #dddde2;--icon-grey: #b1b1ba;--text-focus: #082f75;--brand-colors-neutral-black: #141413;--brand-colors-neutral-900: #535366;--brand-colors-neutral-800: #6d6d7d;--brand-colors-neutral-700: #91919e;--brand-colors-neutral-600: #b1b1ba;--brand-colors-neutral-500: #c8c8cf;--brand-colors-neutral-400: #dddde2;--brand-colors-neutral-300: #ebebee;--brand-colors-neutral-200: #f8f8fb;--brand-colors-neutral-100: #fafafa;--brand-colors-neutral-white: #ffffff;--brand-colors-blue-900: #043059;--brand-colors-blue-800: 
#082f75;--brand-colors-blue-700: #0c3b8d;--brand-colors-blue-600: #0645b1;--brand-colors-blue-500: #386ac1;--brand-colors-blue-400: #cddaef;--brand-colors-blue-300: #e6ecf7;--brand-colors-blue-200: #eef2f9;--brand-colors-blue-100: #f4f7fc;--brand-colors-gold-500: #877440;--brand-colors-gold-400: #e9e3d4;--brand-colors-gold-300: #f2efe8;--brand-colors-gold-200: #f9f6ed;--brand-colors-gold-100: #f9f7f4;--brand-colors-error-900: #920000;--brand-colors-error-500: #b60000;--brand-colors-success-900: #035c0f;--brand-colors-green: #ccffd4;--brand-colors-turquoise: #ccf7ff;--brand-colors-yellow: #f7ffcc;--brand-colors-peach: #ffd4cc;--brand-colors-violet: #f7ccff;--brand-colors-error-100: #fff2f2;--brand-colors-success-500: #05b01c;--brand-colors-success-100: #f0f8f1;--text-secondary: #535366;--icon-white: #ffffff;--background-beige-darker: #f2efe8;--icon-dark-grey: #535366;--type-font-family-sans-serif: Roboto;--type-font-family-serif: Georgia;--type-font-family-mono: IBM Plex Mono;--type-weights-300: 300;--type-weights-400: 400;--type-weights-500: 500;--type-weights-700: 700;--type-sizes-12: 12px;--type-sizes-14: 14px;--type-sizes-16: 16px;--type-sizes-18: 18px;--type-sizes-20: 20px;--type-sizes-22: 22px;--type-sizes-24: 24px;--type-sizes-28: 28px;--type-sizes-30: 30px;--type-sizes-32: 32px;--type-sizes-40: 40px;--type-sizes-42: 42px;--type-sizes-48-2: 48px;--type-line-heights-16: 16px;--type-line-heights-20: 20px;--type-line-heights-23: 23px;--type-line-heights-24: 24px;--type-line-heights-25: 25px;--type-line-heights-26: 26px;--type-line-heights-29: 29px;--type-line-heights-30: 30px;--type-line-heights-32: 32px;--type-line-heights-34: 34px;--type-line-heights-35: 35px;--type-line-heights-36: 36px;--type-line-heights-38: 38px;--type-line-heights-40: 40px;--type-line-heights-46: 46px;--type-line-heights-48: 48px;--type-line-heights-52: 52px;--type-line-heights-58: 58px;--type-line-heights-68: 68px;--type-line-heights-74: 74px;--type-line-heights-82: 
82px;--type-paragraph-spacings-0: 0px;--type-paragraph-spacings-4: 4px;--type-paragraph-spacings-8: 8px;--type-paragraph-spacings-16: 16px;--type-sans-serif-xl-font-weight: 400;--type-sans-serif-xl-size: 32px;--type-sans-serif-xl-line-height: 46px;--type-sans-serif-xl-paragraph-spacing: 16px;--type-sans-serif-lg-font-weight: 400;--type-sans-serif-lg-size: 30px;--type-sans-serif-lg-line-height: 36px;--type-sans-serif-lg-paragraph-spacing: 16px;--type-sans-serif-md-font-weight: 400;--type-sans-serif-md-line-height: 30px;--type-sans-serif-md-paragraph-spacing: 16px;--type-sans-serif-md-size: 24px;--type-sans-serif-xs-font-weight: 700;--type-sans-serif-xs-line-height: 24px;--type-sans-serif-xs-paragraph-spacing: 0px;--type-sans-serif-xs-size: 18px;--type-sans-serif-sm-font-weight: 400;--type-sans-serif-sm-line-height: 32px;--type-sans-serif-sm-paragraph-spacing: 16px;--type-sans-serif-sm-size: 20px;--type-body-xl-font-weight: 400;--type-body-xl-size: 24px;--type-body-xl-line-height: 36px;--type-body-xl-paragraph-spacing: 0px;--type-body-sm-font-weight: 400;--type-body-sm-size: 14px;--type-body-sm-line-height: 20px;--type-body-sm-paragraph-spacing: 8px;--type-body-xs-font-weight: 400;--type-body-xs-size: 12px;--type-body-xs-line-height: 16px;--type-body-xs-paragraph-spacing: 0px;--type-body-md-font-weight: 400;--type-body-md-size: 16px;--type-body-md-line-height: 20px;--type-body-md-paragraph-spacing: 4px;--type-body-lg-font-weight: 400;--type-body-lg-size: 20px;--type-body-lg-line-height: 26px;--type-body-lg-paragraph-spacing: 16px;--type-body-lg-medium-font-weight: 500;--type-body-lg-medium-size: 20px;--type-body-lg-medium-line-height: 32px;--type-body-lg-medium-paragraph-spacing: 16px;--type-body-md-medium-font-weight: 500;--type-body-md-medium-size: 16px;--type-body-md-medium-line-height: 20px;--type-body-md-medium-paragraph-spacing: 4px;--type-body-sm-bold-font-weight: 700;--type-body-sm-bold-size: 14px;--type-body-sm-bold-line-height: 
20px;--type-body-sm-bold-paragraph-spacing: 8px;--type-body-sm-medium-font-weight: 500;--type-body-sm-medium-size: 14px;--type-body-sm-medium-line-height: 20px;--type-body-sm-medium-paragraph-spacing: 8px;--type-serif-md-font-weight: 400;--type-serif-md-size: 32px;--type-serif-md-paragraph-spacing: 0px;--type-serif-md-line-height: 40px;--type-serif-sm-font-weight: 400;--type-serif-sm-size: 24px;--type-serif-sm-paragraph-spacing: 0px;--type-serif-sm-line-height: 26px;--type-serif-lg-font-weight: 400;--type-serif-lg-size: 48px;--type-serif-lg-paragraph-spacing: 0px;--type-serif-lg-line-height: 52px;--type-serif-xs-font-weight: 400;--type-serif-xs-size: 18px;--type-serif-xs-line-height: 24px;--type-serif-xs-paragraph-spacing: 0px;--type-serif-xl-font-weight: 400;--type-serif-xl-size: 48px;--type-serif-xl-paragraph-spacing: 0px;--type-serif-xl-line-height: 58px;--type-mono-md-font-weight: 400;--type-mono-md-size: 22px;--type-mono-md-line-height: 24px;--type-mono-md-paragraph-spacing: 0px;--type-mono-lg-font-weight: 400;--type-mono-lg-size: 40px;--type-mono-lg-line-height: 40px;--type-mono-lg-paragraph-spacing: 0px;--type-mono-sm-font-weight: 400;--type-mono-sm-size: 14px;--type-mono-sm-line-height: 24px;--type-mono-sm-paragraph-spacing: 0px;--spacing-xs-4: 4px;--spacing-xs-8: 8px;--spacing-xs-16: 16px;--spacing-sm-24: 24px;--spacing-sm-32: 32px;--spacing-md-40: 40px;--spacing-md-48: 48px;--spacing-lg-64: 64px;--spacing-lg-80: 80px;--spacing-xlg-104: 104px;--spacing-xlg-152: 152px;--spacing-xs-12: 12px;--spacing-page-section: 80px;--spacing-card-list-spacing: 48px;--spacing-text-section-spacing: 64px;--spacing-md-xs-headings: 40px;--corner-radius-radius-lg: 16px;--corner-radius-radius-sm: 4px;--corner-radius-radius-md: 8px;--corner-radius-radius-round: 104px}}@media(min-width: 568px)and (max-width: 1279px){:root{--token-mode: Parity;--dropshadow: 0 2px 4px 0 #22223340;--primary-brand: #0645b1;--error-dark: #b60000;--success-dark: #05b01c;--inactive-fill: 
#ebebee;--hover: #0c3b8d;--pressed: #082f75;--button-primary-fill-inactive: #ebebee;--button-primary-fill: #0645b1;--button-primary-text: #ffffff;--button-primary-fill-hover: #0c3b8d;--button-primary-fill-press: #082f75;--button-primary-icon: #ffffff;--button-primary-fill-inverse: #ffffff;--button-primary-text-inverse: #082f75;--button-primary-icon-inverse: #0645b1;--button-primary-fill-inverse-hover: #cddaef;--button-primary-stroke-inverse-pressed: #0645b1;--button-secondary-stroke-inactive: #b1b1ba;--button-secondary-fill: #eef2f9;--button-secondary-text: #082f75;--button-secondary-fill-press: #cddaef;--button-secondary-fill-inactive: #ebebee;--button-secondary-stroke: #cddaef;--button-secondary-stroke-hover: #386ac1;--button-secondary-stroke-press: #0645b1;--button-secondary-text-inactive: #b1b1ba;--button-secondary-icon: #082f75;--button-secondary-fill-hover: #e6ecf7;--button-secondary-stroke-inverse: #ffffff;--button-secondary-fill-inverse: rgba(255, 255, 255, 0);--button-secondary-icon-inverse: #ffffff;--button-secondary-icon-hover: #082f75;--button-secondary-icon-press: #082f75;--button-secondary-text-inverse: #ffffff;--button-secondary-text-hover: #082f75;--button-secondary-text-press: #082f75;--button-secondary-fill-inverse-hover: #043059;--button-xs-stroke: #141413;--button-xs-stroke-hover: #0c3b8d;--button-xs-stroke-press: #082f75;--button-xs-stroke-inactive: #ebebee;--button-xs-text: #141413;--button-xs-text-hover: #0c3b8d;--button-xs-text-press: #082f75;--button-xs-text-inactive: #91919e;--button-xs-icon: #141413;--button-xs-icon-hover: #0c3b8d;--button-xs-icon-press: #082f75;--button-xs-icon-inactive: #91919e;--button-xs-fill: #ffffff;--button-xs-fill-hover: #f4f7fc;--button-xs-fill-press: #eef2f9;--buttons-button-text-inactive: #91919e;--buttons-button-focus: #0645b1;--buttons-button-icon-inactive: #91919e;--buttons-small-buttons-corner-radius: 8px;--buttons-small-buttons-l-r-padding: 12px;--buttons-small-buttons-height: 
44px;--buttons-small-buttons-gap: 8px;--buttons-small-buttons-icon-only-width: 44px;--buttons-small-buttons-icon-size: 20px;--buttons-small-buttons-stroke-default: 1px;--buttons-small-buttons-stroke-thick: 2px;--buttons-large-buttons-l-r-padding: 20px;--buttons-large-buttons-height: 54px;--buttons-large-buttons-icon-only-width: 54px;--buttons-large-buttons-icon-size: 20px;--buttons-large-buttons-gap: 8px;--buttons-large-buttons-corner-radius: 8px;--buttons-large-buttons-stroke-default: 1px;--buttons-large-buttons-stroke-thick: 2px;--buttons-extra-small-buttons-l-r-padding: 8px;--buttons-extra-small-buttons-height: 32px;--buttons-extra-small-buttons-icon-size: 16px;--buttons-extra-small-buttons-gap: 4px;--buttons-extra-small-buttons-corner-radius: 8px;--buttons-stroke-default: 1px;--buttons-stroke-thick: 2px;--background-beige: #f9f7f4;--error-light: #fff2f2;--text-placeholder: #6d6d7d;--stroke-dark: #141413;--stroke-light: #dddde2;--stroke-medium: #535366;--accent-green: #ccffd4;--accent-turquoise: #ccf7ff;--accent-yellow: #f7ffcc;--accent-peach: #ffd4cc;--accent-violet: #f7ccff;--accent-purple: #f4f7fc;--text-primary: #141413;--secondary-brand: #141413;--text-hover: #0c3b8d;--text-white: #ffffff;--text-link: #0645b1;--text-press: #082f75;--success-light: #f0f8f1;--background-light-blue: #eef2f9;--background-white: #ffffff;--premium-dark: #877440;--premium-light: #f9f6ed;--stroke-white: #ffffff;--inactive-content: #b1b1ba;--annotate-light: #a35dff;--annotate-dark: #824acc;--grid: #eef2f9;--inactive-stroke: #ebebee;--shadow: rgba(34, 34, 51, 0.25);--text-inactive: #6d6d7d;--text-error: #b60000;--stroke-error: #b60000;--background-error: #fff2f2;--background-black: #141413;--icon-default: #141413;--icon-blue: #0645b1;--background-grey: #dddde2;--icon-grey: #b1b1ba;--text-focus: #082f75;--brand-colors-neutral-black: #141413;--brand-colors-neutral-900: #535366;--brand-colors-neutral-800: #6d6d7d;--brand-colors-neutral-700: #91919e;--brand-colors-neutral-600: 
#b1b1ba;--brand-colors-neutral-500: #c8c8cf;--brand-colors-neutral-400: #dddde2;--brand-colors-neutral-300: #ebebee;--brand-colors-neutral-200: #f8f8fb;--brand-colors-neutral-100: #fafafa;--brand-colors-neutral-white: #ffffff;--brand-colors-blue-900: #043059;--brand-colors-blue-800: #082f75;--brand-colors-blue-700: #0c3b8d;--brand-colors-blue-600: #0645b1;--brand-colors-blue-500: #386ac1;--brand-colors-blue-400: #cddaef;--brand-colors-blue-300: #e6ecf7;--brand-colors-blue-200: #eef2f9;--brand-colors-blue-100: #f4f7fc;--brand-colors-gold-500: #877440;--brand-colors-gold-400: #e9e3d4;--brand-colors-gold-300: #f2efe8;--brand-colors-gold-200: #f9f6ed;--brand-colors-gold-100: #f9f7f4;--brand-colors-error-900: #920000;--brand-colors-error-500: #b60000;--brand-colors-success-900: #035c0f;--brand-colors-green: #ccffd4;--brand-colors-turquoise: #ccf7ff;--brand-colors-yellow: #f7ffcc;--brand-colors-peach: #ffd4cc;--brand-colors-violet: #f7ccff;--brand-colors-error-100: #fff2f2;--brand-colors-success-500: #05b01c;--brand-colors-success-100: #f0f8f1;--text-secondary: #535366;--icon-white: #ffffff;--background-beige-darker: #f2efe8;--icon-dark-grey: #535366;--type-font-family-sans-serif: Roboto;--type-font-family-serif: Georgia;--type-font-family-mono: IBM Plex Mono;--type-weights-300: 300;--type-weights-400: 400;--type-weights-500: 500;--type-weights-700: 700;--type-sizes-12: 12px;--type-sizes-14: 14px;--type-sizes-16: 16px;--type-sizes-18: 18px;--type-sizes-20: 20px;--type-sizes-22: 22px;--type-sizes-24: 24px;--type-sizes-28: 28px;--type-sizes-30: 30px;--type-sizes-32: 32px;--type-sizes-40: 40px;--type-sizes-42: 42px;--type-sizes-48-2: 48px;--type-line-heights-16: 16px;--type-line-heights-20: 20px;--type-line-heights-23: 23px;--type-line-heights-24: 24px;--type-line-heights-25: 25px;--type-line-heights-26: 26px;--type-line-heights-29: 29px;--type-line-heights-30: 30px;--type-line-heights-32: 32px;--type-line-heights-34: 34px;--type-line-heights-35: 
35px;--type-line-heights-36: 36px;--type-line-heights-38: 38px;--type-line-heights-40: 40px;--type-line-heights-46: 46px;--type-line-heights-48: 48px;--type-line-heights-52: 52px;--type-line-heights-58: 58px;--type-line-heights-68: 68px;--type-line-heights-74: 74px;--type-line-heights-82: 82px;--type-paragraph-spacings-0: 0px;--type-paragraph-spacings-4: 4px;--type-paragraph-spacings-8: 8px;--type-paragraph-spacings-16: 16px;--type-sans-serif-xl-font-weight: 400;--type-sans-serif-xl-size: 42px;--type-sans-serif-xl-line-height: 46px;--type-sans-serif-xl-paragraph-spacing: 16px;--type-sans-serif-lg-font-weight: 400;--type-sans-serif-lg-size: 32px;--type-sans-serif-lg-line-height: 36px;--type-sans-serif-lg-paragraph-spacing: 16px;--type-sans-serif-md-font-weight: 400;--type-sans-serif-md-line-height: 34px;--type-sans-serif-md-paragraph-spacing: 16px;--type-sans-serif-md-size: 28px;--type-sans-serif-xs-font-weight: 700;--type-sans-serif-xs-line-height: 25px;--type-sans-serif-xs-paragraph-spacing: 0px;--type-sans-serif-xs-size: 20px;--type-sans-serif-sm-font-weight: 400;--type-sans-serif-sm-line-height: 30px;--type-sans-serif-sm-paragraph-spacing: 16px;--type-sans-serif-sm-size: 24px;--type-body-xl-font-weight: 400;--type-body-xl-size: 24px;--type-body-xl-line-height: 36px;--type-body-xl-paragraph-spacing: 0px;--type-body-sm-font-weight: 400;--type-body-sm-size: 14px;--type-body-sm-line-height: 20px;--type-body-sm-paragraph-spacing: 8px;--type-body-xs-font-weight: 400;--type-body-xs-size: 12px;--type-body-xs-line-height: 16px;--type-body-xs-paragraph-spacing: 0px;--type-body-md-font-weight: 400;--type-body-md-size: 16px;--type-body-md-line-height: 20px;--type-body-md-paragraph-spacing: 4px;--type-body-lg-font-weight: 400;--type-body-lg-size: 20px;--type-body-lg-line-height: 26px;--type-body-lg-paragraph-spacing: 16px;--type-body-lg-medium-font-weight: 500;--type-body-lg-medium-size: 20px;--type-body-lg-medium-line-height: 32px;--type-body-lg-medium-paragraph-spacing: 
16px;--type-body-md-medium-font-weight: 500;--type-body-md-medium-size: 16px;--type-body-md-medium-line-height: 20px;--type-body-md-medium-paragraph-spacing: 4px;--type-body-sm-bold-font-weight: 700;--type-body-sm-bold-size: 14px;--type-body-sm-bold-line-height: 20px;--type-body-sm-bold-paragraph-spacing: 8px;--type-body-sm-medium-font-weight: 500;--type-body-sm-medium-size: 14px;--type-body-sm-medium-line-height: 20px;--type-body-sm-medium-paragraph-spacing: 8px;--type-serif-md-font-weight: 400;--type-serif-md-size: 40px;--type-serif-md-paragraph-spacing: 0px;--type-serif-md-line-height: 48px;--type-serif-sm-font-weight: 400;--type-serif-sm-size: 28px;--type-serif-sm-paragraph-spacing: 0px;--type-serif-sm-line-height: 32px;--type-serif-lg-font-weight: 400;--type-serif-lg-size: 58px;--type-serif-lg-paragraph-spacing: 0px;--type-serif-lg-line-height: 68px;--type-serif-xs-font-weight: 400;--type-serif-xs-size: 18px;--type-serif-xs-line-height: 24px;--type-serif-xs-paragraph-spacing: 0px;--type-serif-xl-font-weight: 400;--type-serif-xl-size: 74px;--type-serif-xl-paragraph-spacing: 0px;--type-serif-xl-line-height: 82px;--type-mono-md-font-weight: 400;--type-mono-md-size: 22px;--type-mono-md-line-height: 24px;--type-mono-md-paragraph-spacing: 0px;--type-mono-lg-font-weight: 400;--type-mono-lg-size: 40px;--type-mono-lg-line-height: 40px;--type-mono-lg-paragraph-spacing: 0px;--type-mono-sm-font-weight: 400;--type-mono-sm-size: 14px;--type-mono-sm-line-height: 24px;--type-mono-sm-paragraph-spacing: 0px;--spacing-xs-4: 4px;--spacing-xs-8: 8px;--spacing-xs-16: 16px;--spacing-sm-24: 24px;--spacing-sm-32: 32px;--spacing-md-40: 40px;--spacing-md-48: 48px;--spacing-lg-64: 64px;--spacing-lg-80: 80px;--spacing-xlg-104: 104px;--spacing-xlg-152: 152px;--spacing-xs-12: 12px;--spacing-page-section: 104px;--spacing-card-list-spacing: 48px;--spacing-text-section-spacing: 80px;--spacing-md-xs-headings: 40px;--corner-radius-radius-lg: 16px;--corner-radius-radius-sm: 
4px;--corner-radius-radius-md: 8px;--corner-radius-radius-round: 104px}}@media(min-width: 1280px){:root{--token-mode: Parity;--dropshadow: 0 2px 4px 0 #22223340;--primary-brand: #0645b1;--error-dark: #b60000;--success-dark: #05b01c;--inactive-fill: #ebebee;--hover: #0c3b8d;--pressed: #082f75;--button-primary-fill-inactive: #ebebee;--button-primary-fill: #0645b1;--button-primary-text: #ffffff;--button-primary-fill-hover: #0c3b8d;--button-primary-fill-press: #082f75;--button-primary-icon: #ffffff;--button-primary-fill-inverse: #ffffff;--button-primary-text-inverse: #082f75;--button-primary-icon-inverse: #0645b1;--button-primary-fill-inverse-hover: #cddaef;--button-primary-stroke-inverse-pressed: #0645b1;--button-secondary-stroke-inactive: #b1b1ba;--button-secondary-fill: #eef2f9;--button-secondary-text: #082f75;--button-secondary-fill-press: #cddaef;--button-secondary-fill-inactive: #ebebee;--button-secondary-stroke: #cddaef;--button-secondary-stroke-hover: #386ac1;--button-secondary-stroke-press: #0645b1;--button-secondary-text-inactive: #b1b1ba;--button-secondary-icon: #082f75;--button-secondary-fill-hover: #e6ecf7;--button-secondary-stroke-inverse: #ffffff;--button-secondary-fill-inverse: rgba(255, 255, 255, 0);--button-secondary-icon-inverse: #ffffff;--button-secondary-icon-hover: #082f75;--button-secondary-icon-press: #082f75;--button-secondary-text-inverse: #ffffff;--button-secondary-text-hover: #082f75;--button-secondary-text-press: #082f75;--button-secondary-fill-inverse-hover: #043059;--button-xs-stroke: #141413;--button-xs-stroke-hover: #0c3b8d;--button-xs-stroke-press: #082f75;--button-xs-stroke-inactive: #ebebee;--button-xs-text: #141413;--button-xs-text-hover: #0c3b8d;--button-xs-text-press: #082f75;--button-xs-text-inactive: #91919e;--button-xs-icon: #141413;--button-xs-icon-hover: #0c3b8d;--button-xs-icon-press: #082f75;--button-xs-icon-inactive: #91919e;--button-xs-fill: #ffffff;--button-xs-fill-hover: #f4f7fc;--button-xs-fill-press: 
#eef2f9;--buttons-button-text-inactive: #91919e;--buttons-button-focus: #0645b1;--buttons-button-icon-inactive: #91919e;--buttons-small-buttons-corner-radius: 8px;--buttons-small-buttons-l-r-padding: 12px;--buttons-small-buttons-height: 44px;--buttons-small-buttons-gap: 8px;--buttons-small-buttons-icon-only-width: 44px;--buttons-small-buttons-icon-size: 20px;--buttons-small-buttons-stroke-default: 1px;--buttons-small-buttons-stroke-thick: 2px;--buttons-large-buttons-l-r-padding: 20px;--buttons-large-buttons-height: 54px;--buttons-large-buttons-icon-only-width: 54px;--buttons-large-buttons-icon-size: 20px;--buttons-large-buttons-gap: 8px;--buttons-large-buttons-corner-radius: 8px;--buttons-large-buttons-stroke-default: 1px;--buttons-large-buttons-stroke-thick: 2px;--buttons-extra-small-buttons-l-r-padding: 8px;--buttons-extra-small-buttons-height: 32px;--buttons-extra-small-buttons-icon-size: 16px;--buttons-extra-small-buttons-gap: 4px;--buttons-extra-small-buttons-corner-radius: 8px;--buttons-stroke-default: 1px;--buttons-stroke-thick: 2px;--background-beige: #f9f7f4;--error-light: #fff2f2;--text-placeholder: #6d6d7d;--stroke-dark: #141413;--stroke-light: #dddde2;--stroke-medium: #535366;--accent-green: #ccffd4;--accent-turquoise: #ccf7ff;--accent-yellow: #f7ffcc;--accent-peach: #ffd4cc;--accent-violet: #f7ccff;--accent-purple: #f4f7fc;--text-primary: #141413;--secondary-brand: #141413;--text-hover: #0c3b8d;--text-white: #ffffff;--text-link: #0645b1;--text-press: #082f75;--success-light: #f0f8f1;--background-light-blue: #eef2f9;--background-white: #ffffff;--premium-dark: #877440;--premium-light: #f9f6ed;--stroke-white: #ffffff;--inactive-content: #b1b1ba;--annotate-light: #a35dff;--annotate-dark: #824acc;--grid: #eef2f9;--inactive-stroke: #ebebee;--shadow: rgba(34, 34, 51, 0.25);--text-inactive: #6d6d7d;--text-error: #b60000;--stroke-error: #b60000;--background-error: #fff2f2;--background-black: #141413;--icon-default: #141413;--icon-blue: 
#0645b1;--background-grey: #dddde2;--icon-grey: #b1b1ba;--text-focus: #082f75;--brand-colors-neutral-black: #141413;--brand-colors-neutral-900: #535366;--brand-colors-neutral-800: #6d6d7d;--brand-colors-neutral-700: #91919e;--brand-colors-neutral-600: #b1b1ba;--brand-colors-neutral-500: #c8c8cf;--brand-colors-neutral-400: #dddde2;--brand-colors-neutral-300: #ebebee;--brand-colors-neutral-200: #f8f8fb;--brand-colors-neutral-100: #fafafa;--brand-colors-neutral-white: #ffffff;--brand-colors-blue-900: #043059;--brand-colors-blue-800: #082f75;--brand-colors-blue-700: #0c3b8d;--brand-colors-blue-600: #0645b1;--brand-colors-blue-500: #386ac1;--brand-colors-blue-400: #cddaef;--brand-colors-blue-300: #e6ecf7;--brand-colors-blue-200: #eef2f9;--brand-colors-blue-100: #f4f7fc;--brand-colors-gold-500: #877440;--brand-colors-gold-400: #e9e3d4;--brand-colors-gold-300: #f2efe8;--brand-colors-gold-200: #f9f6ed;--brand-colors-gold-100: #f9f7f4;--brand-colors-error-900: #920000;--brand-colors-error-500: #b60000;--brand-colors-success-900: #035c0f;--brand-colors-green: #ccffd4;--brand-colors-turquoise: #ccf7ff;--brand-colors-yellow: #f7ffcc;--brand-colors-peach: #ffd4cc;--brand-colors-violet: #f7ccff;--brand-colors-error-100: #fff2f2;--brand-colors-success-500: #05b01c;--brand-colors-success-100: #f0f8f1;--text-secondary: #535366;--icon-white: #ffffff;--background-beige-darker: #f2efe8;--icon-dark-grey: #535366;--type-font-family-sans-serif: Roboto;--type-font-family-serif: Georgia;--type-font-family-mono: IBM Plex Mono;--type-weights-300: 300;--type-weights-400: 400;--type-weights-500: 500;--type-weights-700: 700;--type-sizes-12: 12px;--type-sizes-14: 14px;--type-sizes-16: 16px;--type-sizes-18: 18px;--type-sizes-20: 20px;--type-sizes-22: 22px;--type-sizes-24: 24px;--type-sizes-28: 28px;--type-sizes-30: 30px;--type-sizes-32: 32px;--type-sizes-40: 40px;--type-sizes-42: 42px;--type-sizes-48-2: 48px;--type-line-heights-16: 16px;--type-line-heights-20: 20px;--type-line-heights-23: 
23px;--type-line-heights-24: 24px;--type-line-heights-25: 25px;--type-line-heights-26: 26px;--type-line-heights-29: 29px;--type-line-heights-30: 30px;--type-line-heights-32: 32px;--type-line-heights-34: 34px;--type-line-heights-35: 35px;--type-line-heights-36: 36px;--type-line-heights-38: 38px;--type-line-heights-40: 40px;--type-line-heights-46: 46px;--type-line-heights-48: 48px;--type-line-heights-52: 52px;--type-line-heights-58: 58px;--type-line-heights-68: 68px;--type-line-heights-74: 74px;--type-line-heights-82: 82px;--type-paragraph-spacings-0: 0px;--type-paragraph-spacings-4: 4px;--type-paragraph-spacings-8: 8px;--type-paragraph-spacings-16: 16px;--type-sans-serif-xl-font-weight: 400;--type-sans-serif-xl-size: 42px;--type-sans-serif-xl-line-height: 46px;--type-sans-serif-xl-paragraph-spacing: 16px;--type-sans-serif-lg-font-weight: 400;--type-sans-serif-lg-size: 32px;--type-sans-serif-lg-line-height: 38px;--type-sans-serif-lg-paragraph-spacing: 16px;--type-sans-serif-md-font-weight: 400;--type-sans-serif-md-line-height: 34px;--type-sans-serif-md-paragraph-spacing: 16px;--type-sans-serif-md-size: 28px;--type-sans-serif-xs-font-weight: 700;--type-sans-serif-xs-line-height: 25px;--type-sans-serif-xs-paragraph-spacing: 0px;--type-sans-serif-xs-size: 20px;--type-sans-serif-sm-font-weight: 400;--type-sans-serif-sm-line-height: 30px;--type-sans-serif-sm-paragraph-spacing: 16px;--type-sans-serif-sm-size: 24px;--type-body-xl-font-weight: 400;--type-body-xl-size: 24px;--type-body-xl-line-height: 36px;--type-body-xl-paragraph-spacing: 0px;--type-body-sm-font-weight: 400;--type-body-sm-size: 14px;--type-body-sm-line-height: 20px;--type-body-sm-paragraph-spacing: 8px;--type-body-xs-font-weight: 400;--type-body-xs-size: 12px;--type-body-xs-line-height: 16px;--type-body-xs-paragraph-spacing: 0px;--type-body-md-font-weight: 400;--type-body-md-size: 16px;--type-body-md-line-height: 20px;--type-body-md-paragraph-spacing: 4px;--type-body-lg-font-weight: 400;--type-body-lg-size: 
20px;--type-body-lg-line-height: 26px;--type-body-lg-paragraph-spacing: 16px;--type-body-lg-medium-font-weight: 500;--type-body-lg-medium-size: 20px;--type-body-lg-medium-line-height: 32px;--type-body-lg-medium-paragraph-spacing: 16px;--type-body-md-medium-font-weight: 500;--type-body-md-medium-size: 16px;--type-body-md-medium-line-height: 20px;--type-body-md-medium-paragraph-spacing: 4px;--type-body-sm-bold-font-weight: 700;--type-body-sm-bold-size: 14px;--type-body-sm-bold-line-height: 20px;--type-body-sm-bold-paragraph-spacing: 8px;--type-body-sm-medium-font-weight: 500;--type-body-sm-medium-size: 14px;--type-body-sm-medium-line-height: 20px;--type-body-sm-medium-paragraph-spacing: 8px;--type-serif-md-font-weight: 400;--type-serif-md-size: 40px;--type-serif-md-paragraph-spacing: 0px;--type-serif-md-line-height: 48px;--type-serif-sm-font-weight: 400;--type-serif-sm-size: 28px;--type-serif-sm-paragraph-spacing: 0px;--type-serif-sm-line-height: 32px;--type-serif-lg-font-weight: 400;--type-serif-lg-size: 58px;--type-serif-lg-paragraph-spacing: 0px;--type-serif-lg-line-height: 68px;--type-serif-xs-font-weight: 400;--type-serif-xs-size: 18px;--type-serif-xs-line-height: 24px;--type-serif-xs-paragraph-spacing: 0px;--type-serif-xl-font-weight: 400;--type-serif-xl-size: 74px;--type-serif-xl-paragraph-spacing: 0px;--type-serif-xl-line-height: 82px;--type-mono-md-font-weight: 400;--type-mono-md-size: 22px;--type-mono-md-line-height: 24px;--type-mono-md-paragraph-spacing: 0px;--type-mono-lg-font-weight: 400;--type-mono-lg-size: 40px;--type-mono-lg-line-height: 40px;--type-mono-lg-paragraph-spacing: 0px;--type-mono-sm-font-weight: 400;--type-mono-sm-size: 14px;--type-mono-sm-line-height: 24px;--type-mono-sm-paragraph-spacing: 0px;--spacing-xs-4: 4px;--spacing-xs-8: 8px;--spacing-xs-16: 16px;--spacing-sm-24: 24px;--spacing-sm-32: 32px;--spacing-md-40: 40px;--spacing-md-48: 48px;--spacing-lg-64: 64px;--spacing-lg-80: 80px;--spacing-xlg-104: 104px;--spacing-xlg-152: 
152px;--spacing-xs-12: 12px;--spacing-page-section: 152px;--spacing-card-list-spacing: 48px;--spacing-text-section-spacing: 80px;--spacing-md-xs-headings: 40px;--corner-radius-radius-lg: 16px;--corner-radius-radius-sm: 4px;--corner-radius-radius-md: 8px;--corner-radius-radius-round: 104px}}</style><link crossorigin="" href="https://fonts.gstatic.com/" rel="preconnect" /><link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,opsz,wght@0,9..40,100..1000;1,9..40,100..1000&amp;family=Gupter:wght@400;500;700&amp;family=IBM+Plex+Mono:wght@300;400&amp;family=Material+Symbols+Outlined:opsz,wght,FILL,GRAD@20,400,0,0&amp;display=swap" rel="stylesheet" /><link rel="stylesheet" href="//a.academia-assets.com/assets/design_system/common-57f9da13cef3fd4e2a8b655342c6488eded3e557e823fe67571f2ac77acd7b6f.css" media="all" /> <meta name="author" content="kapil arya" /> <meta name="description" content="Kapil Arya: 33 Followers, 28 Following, 36 Research papers. Research interests: Operating Systems, Parallel Computing, and High Performance Computing." 
/> <meta name="google-site-verification" content="bKJMBZA7E43xhDOopFZkssMMkBRjvYERV-NaN4R6mrs" />
<!-- Page-level globals: controller/action names, environment, hosts and reCAPTCHA keys. -->
<script>
  var $controller_name = 'works';
  var $action_name = "summary";
  var $rails_env = 'production';
  var $app_rev = 'e8a18f05162f50362dc9ff94cb0bb49be84e4276';
  var $domain = 'academia.edu';
  var $app_host = "academia.edu";
  var $asset_host = "academia-assets.com";
  var $start_time = new Date().getTime();
  var $recaptcha_key = "6LdxlRMTAAAAADnu_zyLhLg0YF9uACwz78shpjJB";
  var $recaptcha_invisible_key = "6Lf3KHUUAAAAACggoMpmGJdQDtiyrjVlvGJ6BbAj";
  var $disableClientRecordHit = false;
</script>
<!--
  window.Aedu namespace: site statistics, server/client clock offset, feature flags,
  per-service availability helpers and an APM trace shortcut.
  Every line comment below must end with a real line break: on a collapsed line a
  "//" comment swallows the code that follows it and the script no longer parses.
-->
<script>
  window.Aedu = { hit_data: null };
  window.Aedu.SiteStats = {"premium_universities_count":13895,"monthly_visitors":"31 million","monthly_visitor_count":31300000,"monthly_visitor_count_in_millions":31,"user_count":286271714,"paper_count":55203019,"paper_count_in_millions":55,"page_count":432000000,"page_count_in_millions":432,"pdf_count":16500000,"pdf_count_in_millions":16};
  window.Aedu.serverRenderTime = new Date(1743677517000);
  // Client clock minus server render time, in milliseconds.
  window.Aedu.timeDifference = new Date().getTime() - 1743677517000;
  window.Aedu.isUsingCssV1 = false;
  window.Aedu.enableLocalization = true;
  window.Aedu.activateFullstory = false;
  window.Aedu.serviceAvailability = {
    status: {"attention_db":"on","bibliography_db":"on","contacts_db":"on","email_db":"on","indexability_db":"on","mentions_db":"on","news_db":"on","notifications_db":"on","offsite_mentions_db":"on","redshift":"on","redshift_exports_db":"on","related_works_db":"on","ring_db":"on","user_tests_db":"on"},
    // True only when the named service is reported as "on".
    serviceEnabled: function(service) {
      return this.status[service] === "on";
    },
    // True when the named service is "on" or "read_only".
    readEnabled: function(service) {
      return this.serviceEnabled(service) || this.status[service] === "read_only";
    },
  };
  window.Aedu.viewApmTrace = function() {
    // Check if x-apm-trace-id meta tag is set, and open the trace in APM
    // in a new window if it is.
    var apmTraceId = document.head.querySelector('meta[name="x-apm-trace-id"]');
    if (apmTraceId) {
      var traceId = apmTraceId.content;
      // Use trace ID to construct URL, an example URL looks like:
      // https://app.datadoghq.com/apm/traces?query=trace_id%3A1298410148923562634
      var apmUrl = 'https://app.datadoghq.com/apm/traces?query=trace_id%3A' + traceId;
      window.open(apmUrl, '_blank');
    }
  };
</script>
<!--[if lt IE 9]> <script src="//cdnjs.cloudflare.com/ajax/libs/html5shiv/3.7.2/html5shiv.min.js"></script> <![endif]-->
<link href="https://fonts.googleapis.com/css?family=Roboto:100,100i,300,300i,400,400i,500,500i,700,700i,900,900i" rel="stylesheet">
<!-- Font Awesome is preloaded and switched to a stylesheet by its own onload handler. -->
<link rel="preload" href="//maxcdn.bootstrapcdn.com/font-awesome/4.3.0/css/font-awesome.min.css" as="style" onload="this.rel='stylesheet'">
<link rel="stylesheet" href="//a.academia-assets.com/assets/libraries-a9675dcb01ec4ef6aa807ba772c7a5a00c1820d3ff661c1038a20f80d06bb4e4.css" media="all" />
<link rel="stylesheet" href="//a.academia-assets.com/assets/academia-9982828ed1de4777566441c35ccf7157c55ca779141fce69380d727ebdbbb926.css" media="all" />
<link rel="stylesheet" href="//a.academia-assets.com/assets/design_system_legacy-056a9113b9a0f5343d013b29ee1929d5a18be35fdcdceb616600b4db8bd20054.css" media="all" />
<script src="//a.academia-assets.com/assets/webpack_bundles/runtime-bundle-005434038af4252ca37c527588411a3d6a0eabb5f727fac83f8bbe7fd88d93bb.js"></script>
<script src="//a.academia-assets.com/assets/webpack_bundles/webpack_libraries_and_infrequently_changed.wjs-bundle-ea9e09e22b561126b0d4119ad33eee5d92cc3c2c850b903dfd540d5d5bbafa8f.js"></script>
<script src="//a.academia-assets.com/assets/webpack_bundles/core_webpack.wjs-bundle-7619a748322c52a5dde35876bf9572375d489ce6dc0f5c94eadf71c265acf5fb.js"></script>
<script src="//a.academia-assets.com/assets/webpack_bundles/sentry.wjs-bundle-5fe03fddca915c8ba0f7edbe64c194308e8ce5abaed7bffe1255ff37549c4808.js"></script>
<script>
  jade = window.jade || {};
  jade.helpers = window.$h;
  jade._ =
window._; </script>
<!-- Google Tag Manager -->
<script id="tag-manager-head-root">(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer_old','GTM-5G9JF7Z');</script>
<!-- End Google Tag Manager -->
<script>
  window.gptadslots = [];
  window.googletag = window.googletag || {};
  window.googletag.cmd = window.googletag.cmd || [];
</script>
<script type="text/javascript">
  // TODO(jacob): This should be defined, may be rare load order problem.
  // Checking if null is just a quick fix, will default to en if unset.
  // Better fix is to run this immediately after I18n is set.
  if (window.I18n != null) {
    I18n.defaultLocale = "en";
    I18n.locale = "en";
    I18n.fallbacks = true;
  }
</script>
<link rel="canonical" href="https://independent.academia.edu/KapilArya2" />
</head>
<!--[if gte IE 9 ]> <body class='ie ie9 c-profiles/works a-summary logged_out'> <![endif]-->
<!--[if !(IE) ]><!--> <body class='c-profiles/works a-summary logged_out'> <!--<![endif]-->
<div id="fb-root"></div><script>
  // Initialises FB, then either sets up facebook.ts or leaves a flag for it.
  // Each "//" comment sits on its own line so it cannot swallow the code after it.
  window.fbAsyncInit = function() {
    FB.init({
      appId: "2369844204",
      version: "v8.0",
      status: true,
      cookie: true,
      xfbml: true
    });

    // Additional initialization code.
    if (window.InitFacebook) {
      // facebook.ts already loaded, set it up.
      window.InitFacebook();
    } else {
      // Set a flag for facebook.ts to find when it loads.
      window.academiaAuthReadyFacebook = true;
    }
  };
</script><script>
  // Injects the Facebook SDK <script> once; a no-op when window.FB or the tag already exists.
  window.fbAsyncLoad = function() {
    // Protection against double calling of this function
    if (window.FB) {
      return;
    }
    (function(d, s, id){
      var js, fjs = d.getElementsByTagName(s)[0];
      if (d.getElementById(id)) {return;}
      js = d.createElement(s); js.id = id;
      js.src = "//connect.facebook.net/en_US/sdk.js";
      fjs.parentNode.insertBefore(js, fjs);
    }(document, 'script', 'facebook-jssdk'));
  };
  if (!window.defer_facebook) {
    // Autoload if not deferred
    window.fbAsyncLoad();
  } else {
    // Defer loading by 5 seconds
    setTimeout(function() {
      window.fbAsyncLoad();
    }, 5000);
  }
</script>
<div id="google-root"></div><script>
  // onload callback for the Google client script injected by googleAsyncLoad below.
  window.loadGoogle = function() {
    if (window.InitGoogle) {
      // google.ts already loaded, set it up.
      window.InitGoogle("331998490334-rsn3chp12mbkiqhl6e7lu2q0mlbu0f1b");
    } else {
      // Set a flag for google.ts to use when it loads.
      window.GoogleClientID = "331998490334-rsn3chp12mbkiqhl6e7lu2q0mlbu0f1b";
    }
  };
</script><script>
  // Injects the Google client <script> once; a no-op when the tag already exists.
  window.googleAsyncLoad = function() {
    // Protection against double calling of this function
    (function(d) {
      var js;
      var id = 'google-jssdk';
      var ref = d.getElementsByTagName('script')[0];
      if (d.getElementById(id)) {
        return;
      }
      js = d.createElement('script'); js.id = id; js.async = true;
      js.onload = loadGoogle;
      // Explicit terminator: this assignment previously relied on a line break for ASI.
      js.src = "https://accounts.google.com/gsi/client";
      ref.parentNode.insertBefore(js, ref);
    }(document));
  };
  if (!window.defer_google) {
    // Autoload if not deferred
    window.googleAsyncLoad();
  } else {
    // Defer loading by 5 seconds
    setTimeout(function() {
      window.googleAsyncLoad();
    }, 5000);
  }
</script>
<div id="tag-manager-body-root">
<!-- Google Tag Manager (noscript) -->
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-5G9JF7Z" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<!-- End Google Tag Manager (noscript) -->
<!-- Event listeners for analytics -->
<script>
  window.addEventListener('load', function() { if
(document.querySelector('input[name="commit"]')) {
      // Report a "Log In" click event when the commit input is clicked.
      document.querySelector('input[name="commit"]').addEventListener('click', function() {
        gtag('event', 'click', { event_category: 'button', event_label: 'Log In' });
      });
    }
  });
</script>
</div>
<script>
  // comScore beacon: queue the account ids, then load beacon.js asynchronously
  // from the host that matches the page protocol.
  var _comscore = _comscore || [];
  _comscore.push({ c1: "2", c2: "26766707" });
  (function() {
    var s = document.createElement("script"), el = document.getElementsByTagName("script")[0];
    s.async = true;
    s.src = (document.location.protocol == "https:" ? "https://sb" : "http://b") + ".scorecardresearch.com/beacon.js";
    el.parentNode.insertBefore(s, el);
  })();
</script><img src="https://sb.scorecardresearch.com/p?c1=2&amp;c2=26766707&amp;cv=2.0&amp;cj=1" alt="" style="position: absolute; visibility: hidden" />
<div id='react-modal'></div>
<div class='DesignSystem'> <a class='u-showOnFocus' href='#site'> Skip to main content </a> </div>
<!-- Hidden by default; the script below reveals it for Internet Explorer only. -->
<div id="upgrade_ie_banner" style="display: none;"><p>Academia.edu no longer supports Internet Explorer.</p><p>To browse Academia.edu and the wider internet faster and more securely, please take a few seconds to&nbsp;<a href="https://www.academia.edu/upgrade-browser">upgrade your browser</a>.</p></div><script>
  // Show this banner for all versions of IE
  // (the comment needs its own line, otherwise it swallows the check below).
  if (!!window.MSInputMethodContext || /(MSIE)/.test(navigator.userAgent)) {
    document.getElementById('upgrade_ie_banner').style.display = 'block';
  }
</script>
<div class="DesignSystem bootstrap ShrinkableNav"><div class="navbar navbar-default main-header"><div class="container-wrapper" id="main-header-container"><div class="container"><div class="navbar-header"><div class="nav-left-wrapper u-mt0x"><div class="nav-logo"><a data-main-header-link-target="logo_home" href="https://www.academia.edu/"><img class="visible-xs-inline-block" style="height: 24px;" alt="Academia.edu" src="//a.academia-assets.com/images/academia-logo-redesign-2015-A.svg" width="24" height="24" /><img width="145.2" height="18" class="hidden-xs" style="height: 24px;" alt="Academia.edu"
src="//a.academia-assets.com/images/academia-logo-redesign-2015.svg" /></a></div><div class="nav-search"><div class="SiteSearch-wrapper select2-no-default-pills"><form class="js-SiteSearch-form DesignSystem" action="https://www.academia.edu/search" accept-charset="UTF-8" method="get"><i class="SiteSearch-icon fa fa-search u-fw700 u-positionAbsolute u-tcGrayDark"></i><input class="js-SiteSearch-form-input SiteSearch-form-input form-control" data-main-header-click-target="search_input" name="q" placeholder="Search" type="text" value="" aria-label="Search" /></form></div></div></div><div class="nav-right-wrapper pull-right"><ul class="NavLinks js-main-nav list-unstyled"><li class="NavLinks-link"><a class="js-header-login-url Button Button--inverseGray Button--sm u-mb4x" id="nav_log_in" rel="nofollow" href="https://www.academia.edu/login">Log In</a></li><li class="NavLinks-link u-p0x"><a class="Button Button--inverseGray Button--sm u-mb4x" rel="nofollow" href="https://www.academia.edu/signup">Sign Up</a></li></ul><button class="hidden-lg hidden-md hidden-sm u-ml4x navbar-toggle collapsed" data-target=".js-mobile-header-links" data-toggle="collapse" type="button" aria-label="Toggle navigation"><span class="icon-bar"></span><span class="icon-bar"></span><span class="icon-bar"></span></button></div></div><div class="collapse navbar-collapse js-mobile-header-links"><ul class="nav navbar-nav"><li class="u-borderColorGrayLight u-borderBottom1"><a rel="nofollow" href="https://www.academia.edu/login">Log In</a></li><li class="u-borderColorGrayLight u-borderBottom1"><a rel="nofollow" href="https://www.academia.edu/signup">Sign Up</a></li><li class="u-borderColorGrayLight u-borderBottom1 js-mobile-nav-expand-trigger"><a href="#">more&nbsp;<span class="caret"></span></a></li><li><ul class="js-mobile-nav-expand-section nav navbar-nav u-m0x collapse"><li class="u-borderColorGrayLight u-borderBottom1"><a href="https://www.academia.edu/about">About</a></li><li class="u-borderColorGrayLight u-borderBottom1"><a
rel="nofollow" href="https://www.academia.edu/press">Press</a></li><li class="u-borderColorGrayLight u-borderBottom1"><a href="https://www.academia.edu/documents">Papers</a></li><li class="u-borderColorGrayLight u-borderBottom1"><a rel="nofollow" href="https://www.academia.edu/terms">Terms</a></li><li class="u-borderColorGrayLight u-borderBottom1"><a rel="nofollow" href="https://www.academia.edu/privacy">Privacy</a></li><li class="u-borderColorGrayLight u-borderBottom1"><a rel="nofollow" href="https://www.academia.edu/copyright">Copyright</a></li><li class="u-borderColorGrayLight u-borderBottom1"><a rel="nofollow" href="https://www.academia.edu/hiring"><i class="fa fa-briefcase"></i>&nbsp;We're Hiring!</a></li><li class="u-borderColorGrayLight u-borderBottom1"><a rel="nofollow" href="https://support.academia.edu/hc/en-us"><i class="fa fa-question-circle"></i>&nbsp;Help Center</a></li><li class="js-mobile-nav-collapse-trigger u-borderColorGrayLight u-borderBottom1 dropup" style="display:none"><a href="#">less&nbsp;<span class="caret"></span></a></li></ul></li></ul></div></div></div><script>
  // Mobile header: the "more" and "less" links swap visibility and expand or
  // collapse the extra link section.
  (function(){
    var $moreLink = $(".js-mobile-nav-expand-trigger");
    var $lessLink = $(".js-mobile-nav-collapse-trigger");
    var $section = $('.js-mobile-nav-expand-section');
    $moreLink.click(function(ev){
      ev.preventDefault();
      $moreLink.hide();
      $lessLink.show();
      $section.collapse('show');
    });
    $lessLink.click(function(ev){
      ev.preventDefault();
      $moreLink.show();
      $lessLink.hide();
      $section.collapse('hide');
    });
  })(); // explicit terminator: "})() if (...)" on one line is a syntax error

  // Logged-in visitors get the navigation controller; everyone else gets a
  // login link that carries a redirect back.
  if ($a.is_logged_in() || false) {
    new Aedu.NavigationController({
      el: '.js-main-nav',
      showHighlightedNotification: false
    });
  } else {
    $(".js-header-login-url").attr("href", $a.loginUrlWithRedirect());
  }
  Aedu.autocompleteSearch = new AutocompleteSearch({el: '.js-SiteSearch-form'});
</script></div></div>
<!-- Target of the "Skip to main content" link (href="#site"). -->
<div id='site' class='fixed'>
<div id="content" class="clearfix">
<script>document.addEventListener('DOMContentLoaded', function(){ var $dismissible =
$(".dismissible_banner"); $dismissible.click(function(ev) { $dismissible.hide(); }); });</script> <script src="//a.academia-assets.com/assets/webpack_bundles/profile.wjs-bundle-0d4749eb637d9acf3f125ef24206483a8378882ab36d57629c053436c6027b15.js" defer="defer"></script><script>$viewedUser = Aedu.User.set_viewed( {"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2","photo":"https://0.academia-photos.com/46423/83167/91049/s65_kapil.arya.jpg","has_photo":true,"is_analytics_public":false,"interests":[{"id":435,"name":"Operating Systems","url":"https://www.academia.edu/Documents/in/Operating_Systems"},{"id":442,"name":"Parallel Computing","url":"https://www.academia.edu/Documents/in/Parallel_Computing"},{"id":443,"name":"High Performance Computing","url":"https://www.academia.edu/Documents/in/High_Performance_Computing"},{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":202242,"name":"Checkpointing","url":"https://www.academia.edu/Documents/in/Checkpointing"}]} ); if ($a.is_logged_in() && $viewedUser.is_current_user()) { $('body').addClass('profile-viewed-by-owner'); } $socialProfiles = []</script><div id="js-react-on-rails-context" style="display:none" data-rails-context="{&quot;inMailer&quot;:false,&quot;i18nLocale&quot;:&quot;en&quot;,&quot;i18nDefaultLocale&quot;:&quot;en&quot;,&quot;href&quot;:&quot;https://independent.academia.edu/KapilArya2&quot;,&quot;location&quot;:&quot;/KapilArya2&quot;,&quot;scheme&quot;:&quot;https&quot;,&quot;host&quot;:&quot;independent.academia.edu&quot;,&quot;port&quot;:null,&quot;pathname&quot;:&quot;/KapilArya2&quot;,&quot;search&quot;:null,&quot;httpAcceptLanguage&quot;:null,&quot;serverSide&quot;:false}"></div> <div class="js-react-on-rails-component" style="display:none" 
data-component-name="ProfileCheckPaperUpdate" data-props="{}" data-trace="false" data-dom-id="ProfileCheckPaperUpdate-react-component-67f89ce9-fcf8-4e0e-9231-126c3188b40c"></div> <div id="ProfileCheckPaperUpdate-react-component-67f89ce9-fcf8-4e0e-9231-126c3188b40c"></div> <div class="DesignSystem"><div class="onsite-ping" id="onsite-ping"></div></div><div class="profile-user-info DesignSystem"><div class="social-profile-container"><div class="left-panel-container"><div class="user-info-component-wrapper"><div class="user-summary-cta-container"><div class="user-summary-container"><div class="social-profile-avatar-container"><img class="profile-avatar u-positionAbsolute" alt="Kapil Arya" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" src="https://0.academia-photos.com/46423/83167/91049/s200_kapil.arya.jpg" /></div><div class="title-container"><h1 class="ds2-5-heading-sans-serif-sm">Kapil Arya</h1><div class="affiliations-container fake-truncate js-profile-affiliations"><div><a class="u-tcGrayDarker" href="https://neu.academia.edu/">Northeastern University</a>, <a class="u-tcGrayDarker" href="https://neu.academia.edu/Departments/Computer_Science/Documents">Computer Science</a>, <span class="u-tcGrayDarker">Adjunct</span></div></div></div></div><div class="sidebar-cta-container"><button class="ds2-5-button hidden profile-cta-button grow js-profile-follow-button" data-broccoli-component="user-info.follow-button" data-click-track="profile-user-info-follow-button" data-follow-user-fname="Kapil" data-follow-user-id="46423" data-follow-user-source="profile_button" data-has-google="false"><span class="material-symbols-outlined" style="font-size: 20px" translate="no">add</span>Follow</button><button class="ds2-5-button hidden profile-cta-button grow js-profile-unfollow-button" data-broccoli-component="user-info.unfollow-button" 
data-click-track="profile-user-info-unfollow-button" data-unfollow-user-id="46423"><span class="material-symbols-outlined" style="font-size: 20px" translate="no">done</span>Following</button></div></div><div class="user-stats-container"><a><div class="stat-container js-profile-followers"><p class="label">Followers</p><p class="data">33</p></div></a><a><div class="stat-container js-profile-followees" data-broccoli-component="user-info.followees-count" data-click-track="profile-expand-user-info-following"><p class="label">Following</p><p class="data">28</p></div></a><a><div class="stat-container js-profile-coauthors" data-broccoli-component="user-info.coauthors-count" data-click-track="profile-expand-user-info-coauthors"><p class="label">Co-authors</p><p class="data">9</p></div></a><span><div class="stat-container"><p class="label"><span class="js-profile-total-view-text">Public Views</span></p><p class="data"><span class="js-profile-view-count"></span></p></div></span></div><div class="suggested-academics-container"><div class="suggested-academics--header"><h3 class="ds2-5-heading-sans-serif-xs">Related Authors</h3></div><ul class="suggested-user-card-list" data-nosnippet="true"><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://carleton-ca.academia.edu/dpleibovitz"><img class="profile-avatar u-positionAbsolute" alt="David Pierre Leibovitz related author profile picture" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" src="https://0.academia-photos.com/26128/78833/86073/s200_david.leibovitz.jpg" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" href="https://carleton-ca.academia.edu/dpleibovitz">David Pierre Leibovitz</a><p 
class="suggested-user-card__user-info__subheader ds2-5-body-xs">Carleton University</p></div></div><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://mansoura.academia.edu/QusayFHassan"><img class="profile-avatar u-positionAbsolute" alt="Qusay F. Hassan related author profile picture" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" src="https://0.academia-photos.com/328874/92010/134428315/s200_qusay.f._hassan.jpg" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" href="https://mansoura.academia.edu/QusayFHassan">Qusay F. Hassan</a></div></div><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://icf.academia.edu/ViorelChihaia"><img class="profile-avatar u-positionAbsolute" alt="Viorel Chihaia related author profile picture" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" src="https://0.academia-photos.com/337857/109328380/98566996/s200_viorel.chihaia.png" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" href="https://icf.academia.edu/ViorelChihaia">Viorel Chihaia</a><p class="suggested-user-card__user-info__subheader ds2-5-body-xs">Institute of Physical Chemistry &quot;Ilie Murgulescu&quot;</p></div></div><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://ncit.academia.edu/RoshanChitrakar"><img class="profile-avatar u-positionAbsolute" alt="Roshan 
Chitrakar related author profile picture" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" src="https://0.academia-photos.com/371695/9733675/15833098/s200_roshan.chitrakar.jpg" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" href="https://ncit.academia.edu/RoshanChitrakar">Roshan Chitrakar</a><p class="suggested-user-card__user-info__subheader ds2-5-body-xs">Nepal College of Information Technology</p></div></div><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://gc-cuny.academia.edu/LevManovich"><img class="profile-avatar u-positionAbsolute" alt="Lev Manovich related author profile picture" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" src="https://0.academia-photos.com/412778/130321/66062342/s200_lev.manovich.jpg" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" href="https://gc-cuny.academia.edu/LevManovich">Lev Manovich</a><p class="suggested-user-card__user-info__subheader ds2-5-body-xs">Graduate Center of the City University of New York</p></div></div><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://praxis.academia.edu/JaydipSen"><img class="profile-avatar u-positionAbsolute" alt="Jaydip Sen related author profile picture" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" 
src="https://0.academia-photos.com/1180344/1118482/2361285/s200_jaydip.sen.jpg" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" href="https://praxis.academia.edu/JaydipSen">Jaydip Sen</a><p class="suggested-user-card__user-info__subheader ds2-5-body-xs">Praxis Business School</p></div></div><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://newbulgarian.academia.edu/RossitzaGoleva"><img class="profile-avatar u-positionAbsolute" alt="Rossitza Goleva related author profile picture" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" src="https://0.academia-photos.com/1733476/1035531/1293690/s200_rossitza.goleva.jpg" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" href="https://newbulgarian.academia.edu/RossitzaGoleva">Rossitza Goleva</a><p class="suggested-user-card__user-info__subheader ds2-5-body-xs">New Bulgarian University</p></div></div><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://independent.academia.edu/ThinnThuNaing"><img class="profile-avatar u-positionAbsolute" alt="Thinn Thu Naing related author profile picture" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" src="https://0.academia-photos.com/3976693/1498522/1825224/s200_thinn_thu.naing.jpg" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" 
href="https://independent.academia.edu/ThinnThuNaing">Thinn Thu Naing</a></div></div><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://atauni.academia.edu/FerhatBozkurt"><img class="profile-avatar u-positionAbsolute" alt="Ferhat Bozkurt related author profile picture" border="0" onerror="if (this.src != &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;) this.src = &#39;//a.academia-assets.com/images/s200_no_pic.png&#39;;" width="200" height="200" src="https://0.academia-photos.com/10294204/75477342/118042527/s200_ferhat.bozkurt.jpg" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" href="https://atauni.academia.edu/FerhatBozkurt">Ferhat Bozkurt</a><p class="suggested-user-card__user-info__subheader ds2-5-body-xs">Ataturk University</p></div></div><!-- Placeholder avatar: declares the same 200x200 box as the sibling cards, which fall back to this same file, so space is reserved before the image loads. --><div class="suggested-user-card"><div class="suggested-user-card__avatar social-profile-avatar-container"><a data-nosnippet="" href="https://ucsb.academia.edu/ForrestBrewer"><img class="profile-avatar u-positionAbsolute" alt="Forrest Brewer related author profile picture" border="0" width="200" height="200" src="//a.academia-assets.com/images/s200_no_pic.png" /></a></div><div class="suggested-user-card__user-info"><a class="suggested-user-card__user-info__header ds2-5-body-sm-bold ds2-5-body-link" href="https://ucsb.academia.edu/ForrestBrewer">Forrest Brewer</a><p class="suggested-user-card__user-info__subheader ds2-5-body-xs">University of California, Santa Barbara</p></div></div></ul></div><style type="text/css">.suggested-academics--header h3{font-size:16px;font-weight:500;line-height:20px}</style><div class="ri-section"><div class="ri-section-header"><span>Interests</span></div><div class="ri-tags-container"><a data-click-track="profile-user-info-expand-research-interests" data-has-card-for-ri-list="46423" 
href="https://www.academia.edu/Documents/in/Operating_Systems"><div id="js-react-on-rails-context" style="display:none" data-rails-context="{&quot;inMailer&quot;:false,&quot;i18nLocale&quot;:&quot;en&quot;,&quot;i18nDefaultLocale&quot;:&quot;en&quot;,&quot;href&quot;:&quot;https://independent.academia.edu/KapilArya2&quot;,&quot;location&quot;:&quot;/KapilArya2&quot;,&quot;scheme&quot;:&quot;https&quot;,&quot;host&quot;:&quot;independent.academia.edu&quot;,&quot;port&quot;:null,&quot;pathname&quot;:&quot;/KapilArya2&quot;,&quot;search&quot;:null,&quot;httpAcceptLanguage&quot;:null,&quot;serverSide&quot;:false}"></div> <div class="js-react-on-rails-component" style="display:none" data-component-name="Pill" data-props="{&quot;color&quot;:&quot;gray&quot;,&quot;children&quot;:[&quot;Operating Systems&quot;]}" data-trace="false" data-dom-id="Pill-react-component-7801c4d9-fb4e-464e-8076-cddd204041dc"></div> <div id="Pill-react-component-7801c4d9-fb4e-464e-8076-cddd204041dc"></div> </a><a data-click-track="profile-user-info-expand-research-interests" data-has-card-for-ri-list="46423" href="https://www.academia.edu/Documents/in/Parallel_Computing"><div class="js-react-on-rails-component" style="display:none" data-component-name="Pill" data-props="{&quot;color&quot;:&quot;gray&quot;,&quot;children&quot;:[&quot;Parallel Computing&quot;]}" data-trace="false" data-dom-id="Pill-react-component-0b9fe124-0824-470a-afa5-f504a88dc42e"></div> <div id="Pill-react-component-0b9fe124-0824-470a-afa5-f504a88dc42e"></div> </a><a data-click-track="profile-user-info-expand-research-interests" data-has-card-for-ri-list="46423" href="https://www.academia.edu/Documents/in/High_Performance_Computing"><div class="js-react-on-rails-component" style="display:none" data-component-name="Pill" data-props="{&quot;color&quot;:&quot;gray&quot;,&quot;children&quot;:[&quot;High Performance Computing&quot;]}" data-trace="false" data-dom-id="Pill-react-component-4f80873b-db20-4f2a-9202-75852576a311"></div> 
<div id="Pill-react-component-4f80873b-db20-4f2a-9202-75852576a311"></div> </a><a data-click-track="profile-user-info-expand-research-interests" data-has-card-for-ri-list="46423" href="https://www.academia.edu/Documents/in/Computer_Science"><div class="js-react-on-rails-component" style="display:none" data-component-name="Pill" data-props="{&quot;color&quot;:&quot;gray&quot;,&quot;children&quot;:[&quot;Computer Science&quot;]}" data-trace="false" data-dom-id="Pill-react-component-5bb26fab-b522-4d0d-b1fc-5b138815b5e4"></div> <div id="Pill-react-component-5bb26fab-b522-4d0d-b1fc-5b138815b5e4"></div> </a></div></div></div></div><div class="right-panel-container"><div class="user-content-wrapper"><div class="uploads-container" id="social-redesign-work-container"><div class="upload-header"><h2 class="ds2-5-heading-sans-serif-xs">Uploads</h2></div><div class="documents-container backbone-social-profile-documents" style="width: 100%;"><div class="u-taCenter"></div><div class="profile--tab_content_container js-tab-pane tab-pane active" id="all"><div class="profile--tab_heading_container js-section-heading" data-section="Papers" id="Papers"><h3 class="profile--tab_heading_container">Papers by Kapil Arya</h3></div><div class="js-work-strip profile--work_container" data-work-id="117958233"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/117958233/Transparent_Checkpoint_Restart_over_InfiniBand"><img alt="Research paper thumbnail of Transparent Checkpoint-Restart over InfiniBand" class="work-thumbnail" src="https://attachments.academia-assets.com/113694222/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" 
href="https://www.academia.edu/117958233/Transparent_Checkpoint_Restart_over_InfiniBand">Transparent Checkpoint-Restart over InfiniBand</a></div><div class="wp-workCard_item"><span>arXiv (Cornell University)</span><span>, Dec 13, 2013</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">InfiniBand is widely used for low-latency, high-throughput cluster computing. Saving the state of...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">InfiniBand is widely used for low-latency, high-throughput cluster computing. Saving the state of the InfiniBand network as part of distributed checkpointing has been a long-standing challenge for researchers. Because of a lack of a solution, typical MPI implementations have included custom checkpoint-restart services that &quot;tear down&quot; the network, checkpoint each node as if the node were a standalone computer, and then reconnect the network again. We present the first example of transparent, system-initiated checkpoint-restart that directly supports InfiniBand. The new approach is independent of any particular Linux kernel, thus simplifying the current practice of using a kernel-based module, such as BLCR. This direct approach results in checkpoints that are found to be faster than with the use of a checkpoint-restart service. The generality of this approach is shown not only by checkpointing an MPI computation, but also a native UPC computation (Berkeley Unified Parallel C), which does not use MPI. Scalability is shown by checkpointing 2,048 MPI processes across 128 nodes (with 16 cores per node). 
In addition, a cost-effective debugging approach is also enabled, in which a checkpoint image from an InfiniBand-based production cluster is copied to a local Ethernet-based cluster, where it can be restarted and an interactive debugger can be attached to it. This work is based on a plugin that extends the DMTCP (Distributed MultiThreaded CheckPointing) checkpoint-restart package.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="92e1eb3446b39a3dd645808d227703a8" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:113694222,&quot;asset_id&quot;:117958233,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/113694222/download_file?s=profile"><span><i class="fa fa-arrow-down" aria-hidden="true"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="117958233"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil" aria-hidden="true"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="117958233"><i class="fa fa-spinner fa-spin" aria-hidden="true"></i></span><script>
/*
 * View counter for work 117958233.
 * On DOM ready, registers a callback with window.Academia.workViewCountsFetcher. When the
 * count arrives, the label built by $h.commaizeInt and $h.pluralize replaces the spinner
 * and is also set as the element's tooltip title.
 * The icon-only <i> elements in this strip carry aria-hidden="true" because the adjacent
 * text ("Download", "Edit", the count itself) already names each control.
 */
$(function () {
  var workId = 117958233;
  window.Academia.workViewCountsFetcher.queue(workId, function (count) {
    var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View');
    /* Single lookup: .text() returns the same jQuery set, so the remaining calls chain. */
    $(".js-view-count[data-work-id=117958233]").text(description).attr('title', description).tooltip();
  });
});
</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var 
workId = 117958233; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='117958233']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script>
/*
 * Work-strip bootstrap for work 117958233.
 * The function returned by require.config (waitSeconds raised to 90) is called with the
 * two asset bundles. Once they have loaded, the callback builds a WowProfile.WorkStripView
 * for every matching strip that is not yet flagged 'initialized', handing it the
 * server-rendered workJSON record below.
 * Only block comments and explicit semicolons are used, and every string literal stays on
 * one line, so the script still parses when its line breaks are collapsed to spaces.
 */
require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() {
  /* from javascript_helper.rb */
  var dispatcherData = {};
  if (true) {
    /* Reuse the page-wide dispatcher when an earlier strip already created it. */
    window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events);
    dispatcherData = {
      dispatcher: window.WowProfile.dispatcher,
      downloadLinkId: "92e1eb3446b39a3dd645808d227703a8"
    };
  }
  $('.js-work-strip[data-work-id=117958233]').each(function() {
    /* The 'initialized' data flag keeps a strip from getting a second view. */
    if (!$(this).data('initialized')) {
      new WowProfile.WorkStripView({
        el: this,
        /* Server-rendered record, reproduced verbatim on a single line. */
        workJSON: {"id":117958233,"title":"Transparent Checkpoint-Restart over InfiniBand","translated_title":"","metadata":{"publisher":"Cornell University","grobid_abstract":"InfiniBand is widely used for low-latency, high-throughput cluster computing. Saving the state of the InfiniBand network as part of distributed checkpointing has been a long-standing challenge for researchers. Because of a lack of a solution, typical MPI implementations have included custom checkpoint-restart services that \"tear down\" the network, checkpoint each node as if the node were a standalone computer, and then reconnect the network again. We present the first example of transparent, system-initiated checkpoint-restart that directly supports In-finiBand. The new approach is independent of any particular Linux kernel, thus simplifying the current practice of using a kernel-based module, such as BLCR. This direct approach results in checkpoints that are found to be faster than with the use of a checkpoint-restart service. The generality of this approach is shown not only by checkpointing an MPI computation, but also a native UPC computation (Berkeley Unified Parallel C), which does not use MPI. Scalability is shown by checkpointing 2,048 MPI processes across 128 nodes (with 16 cores per node). In addition, a cost-effective debugging approach is also enabled, in which a checkpoint image from an InfiniBand-based production cluster is copied to a local Ethernet-based cluster, where it can be restarted and an interactive debugger can be attached to it. This work is based on a plugin that extends the DMTCP (Distributed MultiThreaded CheckPointing) checkpoint-restart package.","publication_date":{"day":13,"month":12,"year":2013,"errors":{}},"publication_name":"arXiv (Cornell University)","grobid_abstract_attachment_id":113694221},"translated_abstract":null,"internal_url":"https://www.academia.edu/117958233/Transparent_Checkpoint_Restart_over_InfiniBand","translated_internal_url":"","created_at":"2024-04-23T15:13:55.878-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":113694222,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694222/thumbnails/1.jpg","file_name":"1312.pdf","download_url":"https://www.academia.edu/attachments/113694222/download_file","bulk_download_file_name":"Transparent_Checkpoint_Restart_over_Infi.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694222/1312-libre.pdf?1713911126=\u0026response-content-disposition=attachment%3B+filename%3DTransparent_Checkpoint_Restart_over_Infi.pdf\u0026Expires=1743681116\u0026Signature=Vw5dKUWuajiZISQsaH40UrO8TgeuT3wDjIy2X85aK1ttBBlWCsYSz6KT49-AKj36APKzpX1-Ou-XaIy1EbRGptB82acZm-mte6y370-f-wLkCYfscbinVcx63SzHX69-jZ8fsvcpXgdlEBHKiuMdI0ZMZrmRFI4tG61bXtkD3O9bHh~wjTuQzBs81HF~IkaKY5LV3vwXpxS~qe5g5Mc3JC9PorDysfjhY3jYdj8eHVu2sos3sWPdQSxthiiG-6UoPudQM3mBmfqQJSWbgLgahXM3IOcz8uZDwiyFHg21zCZ4fi6BNRE--bto0U3Oq6Jas2764d0lIstEeiPa5GM05A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Transparent_Checkpoint_Restart_over_InfiniBand","translated_slug":"","page_count":22,"language":"en","content_type":"Work","summary":"InfiniBand is widely used for low-latency, high-throughput cluster computing. Saving the state of the InfiniBand network as part of distributed checkpointing has been a long-standing challenge for researchers. Because of a lack of a solution, typical MPI implementations have included custom checkpoint-restart services that \"tear down\" the network, checkpoint each node as if the node were a standalone computer, and then reconnect the network again. We present the first example of transparent, system-initiated checkpoint-restart that directly supports In-finiBand. The new approach is independent of any particular Linux kernel, thus simplifying the current practice of using a kernel-based module, such as BLCR. This direct approach results in checkpoints that are found to be faster than with the use of a checkpoint-restart service. The generality of this approach is shown not only by checkpointing an MPI computation, but also a native UPC computation (Berkeley Unified Parallel C), which does not use MPI. Scalability is shown by checkpointing 2,048 MPI processes across 128 nodes (with 16 cores per node). In addition, a cost-effective debugging approach is also enabled, in which a checkpoint image from an InfiniBand-based production cluster is copied to a local Ethernet-based cluster, where it can be restarted and an interactive debugger can be attached to it. This work is based on a plugin that extends the DMTCP (Distributed MultiThreaded CheckPointing) checkpoint-restart package.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":113694222,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694222/thumbnails/1.jpg","file_name":"1312.pdf","download_url":"https://www.academia.edu/attachments/113694222/download_file","bulk_download_file_name":"Transparent_Checkpoint_Restart_over_Infi.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694222/1312-libre.pdf?1713911126=\u0026response-content-disposition=attachment%3B+filename%3DTransparent_Checkpoint_Restart_over_Infi.pdf\u0026Expires=1743681116\u0026Signature=Vw5dKUWuajiZISQsaH40UrO8TgeuT3wDjIy2X85aK1ttBBlWCsYSz6KT49-AKj36APKzpX1-Ou-XaIy1EbRGptB82acZm-mte6y370-f-wLkCYfscbinVcx63SzHX69-jZ8fsvcpXgdlEBHKiuMdI0ZMZrmRFI4tG61bXtkD3O9bHh~wjTuQzBs81HF~IkaKY5LV3vwXpxS~qe5g5Mc3JC9PorDysfjhY3jYdj8eHVu2sos3sWPdQSxthiiG-6UoPudQM3mBmfqQJSWbgLgahXM3IOcz8uZDwiyFHg21zCZ4fi6BNRE--bto0U3Oq6Jas2764d0lIstEeiPa5GM05A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":113694221,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694221/thumbnails/1.jpg","file_name":"1312.pdf","download_url":"https://www.academia.edu/attachments/113694221/download_file","bulk_download_file_name":"Transparent_Checkpoint_Restart_over_Infi.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694221/1312-libre.pdf?1713911125=\u0026response-content-disposition=attachment%3B+filename%3DTransparent_Checkpoint_Restart_over_Infi.pdf\u0026Expires=1743681116\u0026Signature=YxeUWCf8xCVNpEsm~xB3WMypwacgzb79h1BC8l~tGuC1VhTqCT71Z8ySFIZauU6lClKzMY1XGPes26DKjWtJO4SGG6MLkOnRKq9Qiv-aWsLnddptRe0iakMuuSy8s5Rvb5-k3FVEFJBN5qju6TabN2hSl1HvMYSGrLTmQAMbNIDOdY5ByU9RLQN-7IErkryLs7DFB79yYR9SzIZ~xGxyqvIyo2rEKY5ONzk5Ehwq9qv2TiI~1p1xGa1Q-cK7WvzEmDQ8lQCAT~ewlVuTy9xSvMPhMgPi~MK1uA8i9ImqEjLAnuxgMa8Y9gABikE1VWArobdgKrETDUU6pmDw~xGrSQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":442,"name":"Parallel Computing","url":"https://www.academia.edu/Documents/in/Parallel_Computing"},{"id":44244,"name":"OPERATING SYSTEM","url":"https://www.academia.edu/Documents/in/OPERATING_SYSTEM"},{"id":75768,"name":"MPI","url":"https://www.academia.edu/Documents/in/MPI"},{"id":377043,"name":"Scalability","url":"https://www.academia.edu/Documents/in/Scalability"},{"id":491492,"name":"InfiniBand","url":"https://www.academia.edu/Documents/in/InfiniBand"},{"id":983490,"name":"Operating Systems (In Computer Science)","url":"https://www.academia.edu/Documents/in/Operating_Systems_In_Computer_Science_"},{"id":1188947,"name":"D","url":"https://www.academia.edu/Documents/in/D-351414216"}],"urls":[{"id":41341950,"url":"http://arxiv.org/pdf/1312.3938"}]},
        dispatcherData: dispatcherData
      });
      $(this).data('initialized', true);
    }
  });
  $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip");
  /* Figure carousel is switched off for this strip in the rendered page. */
  if (false) {
    Aedu.setUpFigureCarousel('profile-work-117958233-figures');
  }
});
</script> <div class="js-work-strip profile--work_container" data-work-id="117958232"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/117958232/Adapting_the_DMTCP_Plugin_Model_for_Checkpointing_of_Hardware_Emulation"><img alt="Research paper thumbnail of Adapting the DMTCP Plugin Model for Checkpointing of Hardware Emulation" class="work-thumbnail" src="https://attachments.academia-assets.com/113694220/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/117958232/Adapting_the_DMTCP_Plugin_Model_for_Checkpointing_of_Hardware_Emulation">Adapting the DMTCP Plugin Model for Checkpointing of Hardware 
Emulation</a></div><div class="wp-workCard_item"><span>ArXiv</span><span>, 2017</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Checkpoint-restart is now a mature technology. It allows a user to save and later restore the sta...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Checkpoint-restart is now a mature technology. It allows a user to save and later restore the state of a running process. The new plugin model for the upcoming version 3.0 of DMTCP (Distributed MultiThreaded Checkpointing) is described here. This plugin model allows a target application to disconnect from the hardware emulator at checkpoint time and then re-connect to a possibly different hardware emulator at the time of restart. The DMTCP plugin model is important in allowing three distinct parties to seamlessly inter-operate. The three parties are: the EDA designer, who is concerned with formal verification of a circuit design; the DMTCP developers, who are concerned with providing transparent checkpointing during the circuit emulation; and the hardware emulator vendor, who provides a plugin library that responds to checkpoint, restart, and other events. 
The new plugin model is an example of process-level virtualization: virtualization of external abstractions from within a proces...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="677f230684aec6c8dfcd767d8bdf7399" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:113694220,&quot;asset_id&quot;:117958232,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/113694220/download_file?s=profile"><span><i class="fa fa-arrow-down" aria-hidden="true"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="117958232"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil" aria-hidden="true"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="117958232"><i class="fa fa-spinner fa-spin" aria-hidden="true"></i></span><script>
/*
 * View counter for work 117958232.
 * On DOM ready, registers a callback with window.Academia.workViewCountsFetcher. When the
 * count arrives, the label built by $h.commaizeInt and $h.pluralize replaces the spinner
 * and is also set as the element's tooltip title.
 * The icon-only <i> elements in this strip carry aria-hidden="true" because the adjacent
 * text ("Download", "Edit", the count itself) already names each control.
 */
$(function () {
  var workId = 117958232;
  window.Academia.workViewCountsFetcher.queue(workId, function (count) {
    var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View');
    /* Single lookup: .text() returns the same jQuery set, so the remaining calls chain. */
    $(".js-view-count[data-work-id=117958232]").text(description).attr('title', description).tooltip();
  });
});
</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 117958232; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='117958232']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + 
percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script>
/*
 * Work-strip bootstrap for work 117958232.
 * The function returned by require.config (waitSeconds raised to 90) is called with the
 * two asset bundles. Once they have loaded, the callback builds a WowProfile.WorkStripView
 * for every matching strip that is not yet flagged 'initialized', handing it the
 * server-rendered workJSON record below.
 * Only block comments and explicit semicolons are used, and every string literal and
 * number stays on one line, so the script still parses when its line breaks are
 * collapsed to spaces.
 */
require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() {
  /* from javascript_helper.rb */
  var dispatcherData = {};
  if (true) {
    /* Reuse the page-wide dispatcher when an earlier strip already created it. */
    window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events);
    dispatcherData = {
      dispatcher: window.WowProfile.dispatcher,
      downloadLinkId: "677f230684aec6c8dfcd767d8bdf7399"
    };
  }
  $('.js-work-strip[data-work-id=117958232]').each(function() {
    /* The 'initialized' data flag keeps a strip from getting a second view. */
    if (!$(this).data('initialized')) {
      new WowProfile.WorkStripView({
        el: this,
        /* Server-rendered record, reproduced verbatim on a single line. */
        workJSON: {"id":117958232,"title":"Adapting the DMTCP Plugin Model for Checkpointing of Hardware Emulation","translated_title":"","metadata":{"abstract":"Checkpoint-restart is now a mature technology. It allows a user to save and later restore the state of a running process. The new plugin model for the upcoming version 3.0 of DMTCP (Distributed MultiThreaded Checkpointing) is described here. This plugin model allows a target application to disconnect from the hardware emulator at checkpoint time and then re-connect to a possibly different hardware emulator at the time of restart. The DMTCP plugin model is important in allowing three distinct parties to seamlessly inter-operate. The three parties are: the EDA designer, who is concerned with formal verification of a circuit design; the DMTCP developers, who are concerned with providing transparent checkpointing during the circuit emulation; and the hardware emulator vendor, who provides a plugin library that responds to checkpoint, restart, and other events. The new plugin model is an example of process-level virtualization: virtualization of external abstractions from within a proces...","publisher":"ArXiv","ai_title_tag":"DMTCP Plugin Model for Hardware Emulation","publication_date":{"day":null,"month":null,"year":2017,"errors":{}},"publication_name":"ArXiv"},"translated_abstract":"Checkpoint-restart is now a mature technology. It allows a user to save and later restore the state of a running process. The new plugin model for the upcoming version 3.0 of DMTCP (Distributed MultiThreaded Checkpointing) is described here. This plugin model allows a target application to disconnect from the hardware emulator at checkpoint time and then re-connect to a possibly different hardware emulator at the time of restart. The DMTCP plugin model is important in allowing three distinct parties to seamlessly inter-operate. The three parties are: the EDA designer, who is concerned with formal verification of a circuit design; the DMTCP developers, who are concerned with providing transparent checkpointing during the circuit emulation; and the hardware emulator vendor, who provides a plugin library that responds to checkpoint, restart, and other events. The new plugin model is an example of process-level virtualization: virtualization of external abstractions from within a proces...","internal_url":"https://www.academia.edu/117958232/Adapting_the_DMTCP_Plugin_Model_for_Checkpointing_of_Hardware_Emulation","translated_internal_url":"","created_at":"2024-04-23T15:13:55.593-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":113694220,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694220/thumbnails/1.jpg","file_name":"1703.00897v1.pdf","download_url":"https://www.academia.edu/attachments/113694220/download_file","bulk_download_file_name":"Adapting_the_DMTCP_Plugin_Model_for_Chec.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694220/1703.00897v1-libre.pdf?1713911121=\u0026response-content-disposition=attachment%3B+filename%3DAdapting_the_DMTCP_Plugin_Model_for_Chec.pdf\u0026Expires=1743681116\u0026Signature=TPCYVG37lq2mnb15OKVsC7YgZSS92NjA7eloh2UT4QWMS70QcLMCXQJCQ4A5y00HWu9yr~gQBf75-mcICbIt5NLXgWDm53-b4oXAkYO1PdvXkLnMgFR4gYyqtQ54EJN4EqdvR-C5mHAduIDW7q5b6rJAnwx8~f8dm9pKCXMbyV3h9S2hhxVUK75in3QO71loUu2UENs3ZSGqfSrCBBv4oKo93WTB6onMM-zUdDCcWj~Tnl9XFnBxIvEiLXEvegPDvyEPiNfRxtFhET3B5pwCLwvKdcPZG8LAtim-kiBe-K3vcKhdvXMdqY2u6SwOXEsdLvZiag1vwTOoLQhOEOG7Vw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Adapting_the_DMTCP_Plugin_Model_for_Checkpointing_of_Hardware_Emulation","translated_slug":"","page_count":6,"language":"en","content_type":"Work","summary":"Checkpoint-restart is now a mature technology. It allows a user to save and later restore the state of a running process. The new plugin model for the upcoming version 3.0 of DMTCP (Distributed MultiThreaded Checkpointing) is described here. This plugin model allows a target application to disconnect from the hardware emulator at checkpoint time and then re-connect to a possibly different hardware emulator at the time of restart. The DMTCP plugin model is important in allowing three distinct parties to seamlessly inter-operate. The three parties are: the EDA designer, who is concerned with formal verification of a circuit design; the DMTCP developers, who are concerned with providing transparent checkpointing during the circuit emulation; and the hardware emulator vendor, who provides a plugin library that responds to checkpoint, restart, and other events. The new plugin model is an example of process-level virtualization: virtualization of external abstractions from within a proces...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":113694220,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694220/thumbnails/1.jpg","file_name":"1703.00897v1.pdf","download_url":"https://www.academia.edu/attachments/113694220/download_file","bulk_download_file_name":"Adapting_the_DMTCP_Plugin_Model_for_Chec.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694220/1703.00897v1-libre.pdf?1713911121=\u0026response-content-disposition=attachment%3B+filename%3DAdapting_the_DMTCP_Plugin_Model_for_Chec.pdf\u0026Expires=1743681116\u0026Signature=TPCYVG37lq2mnb15OKVsC7YgZSS92NjA7eloh2UT4QWMS70QcLMCXQJCQ4A5y00HWu9yr~gQBf75-mcICbIt5NLXgWDm53-b4oXAkYO1PdvXkLnMgFR4gYyqtQ54EJN4EqdvR-C5mHAduIDW7q5b6rJAnwx8~f8dm9pKCXMbyV3h9S2hhxVUK75in3QO71loUu2UENs3ZSGqfSrCBBv4oKo93WTB6onMM-zUdDCcWj~Tnl9XFnBxIvEiLXEvegPDvyEPiNfRxtFhET3B5pwCLwvKdcPZG8LAtim-kiBe-K3vcKhdvXMdqY2u6SwOXEsdLvZiag1vwTOoLQhOEOG7Vw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":113694219,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694219/thumbnails/1.jpg","file_name":"1703.00897v1.pdf","download_url":"https://www.academia.edu/attachments/113694219/download_file","bulk_download_file_name":"Adapting_the_DMTCP_Plugin_Model_for_Chec.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694219/1703.00897v1-libre.pdf?1713911119=\u0026response-content-disposition=attachment%3B+filename%3DAdapting_the_DMTCP_Plugin_Model_for_Chec.pdf\u0026Expires=1743681116\u0026Signature=YIkwMCN5cNbC51e7shniWopfbX3uIXQJigZQSMq0~RqVZ4EExQPa2O-xJkrZklVjXQsB8sExnIwEuigvmfOFXVbriflnpzR3GTvE2CkVQbxKOyp2a5vBO5l5X~XEaCnsebstSCt1oHwB9VbyO6IIU-zIndjHexF5e8wPF6oG~UIqqauiQgaygFienDLcKtFkyeCAZ9~A5LappD5Y~ksoyCQLoqUQAklPY8~ZOu~mrDk5lacRxYVXpDQtjLZTkK3ZvR73fKFfKziunGHFBvGphMDjrHhmNCEefE30MuF3Wl741GU5wTwhodG8tpDmjaXXAxUDqifym~kEqgydhDiY1A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":44244,"name":"OPERATING SYSTEM","url":"https://www.academia.edu/Documents/in/OPERATING_SYSTEM"},{"id":154848,"name":"Emulation","url":"https://www.academia.edu/Documents/in/Emulation"},{"id":1148030,"name":"Embedded System","url":"https://www.academia.edu/Documents/in/Embedded_System"},{"id":3193313,"name":"arXiv","url":"https://www.academia.edu/Documents/in/arXiv"}],"urls":[{"id":41341949,"url":"https://arxiv.org/pdf/1703.00897v1.pdf"}]},
        dispatcherData: dispatcherData
      });
      $(this).data('initialized', true);
    }
  });
  $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip");
  /* Figure carousel is switched off for this strip in the rendered page. */
  if (false) {
    Aedu.setUpFigureCarousel('profile-work-117958232-figures');
  }
});
</script> <div class="js-work-strip profile--work_container" data-work-id="117958231"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" 
href="https://www.academia.edu/117958231/DMTCP_Scalable_User_Level_Transparent_Checkpointing_for_Cluster_Computations"><img alt="Research paper thumbnail of DMTCP: Scalable User-Level Transparent Checkpointing for Cluster Computations" class="work-thumbnail" src="https://attachments.academia-assets.com/113694238/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/117958231/DMTCP_Scalable_User_Level_Transparent_Checkpointing_for_Cluster_Computations">DMTCP: Scalable User-Level Transparent Checkpointing for Cluster Computations</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">As the size of clusters increases, failures are becoming increasingly frequent. Applications must...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">As the size of clusters increases, failures are becoming increasingly frequent. Applications must become fault tolerant if they are to run for extended periods of time. We present DMTCP (Distributed MultiThreaded CheckPointing), the first user-level distributed checkpointing package not dependent on a specific message passing library. This contrasts with existing approaches either specific to libraries such as MPI or requiring kernel modification. DMTCP provides fault tolerance through checkpointing. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads. 
DMTCP automatically accounts for TCP/IP sockets, UNIX domain sockets, pipes, ptys (pseudo-terminals), signal handlers, ordinary file descriptors, shared file descriptors, and other operating system artifacts. We demonstrate checkpointing and restart of applications communicating through MPICH2, OpenMPI, and sockets directly. These applications were written with a variety of languages including Fortran, C, C++, and Python. Results show that checkpoint time remains nearly constant as the number of nodes increases.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="17b902baca41113359321f78cf3d844d" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:113694238,&quot;asset_id&quot;:117958231,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/113694238/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="117958231"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="117958231"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 117958231; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=117958231]").text(description); $(".js-view-count[data-work-id=117958231]").attr('title', description).tooltip(); }); 
});</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 117958231; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='117958231']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "17b902baca41113359321f78cf3d844d" } } $('.js-work-strip[data-work-id=117958231]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":117958231,"title":"DMTCP: Scalable User-Level Transparent Checkpointing for Cluster Computations","translated_title":"","metadata":{"ai_title_tag":"DMTCP: Scalable Checkpointing for Clusters","grobid_abstract":"As the size of clusters increases, failures are becoming increasingly frequent. Applications must become fault tolerant if they are to run for extended periods of time. We present DMTCP (Distributed MultiThreaded CheckPointing), the first user-level distributed checkpointing package not dependent on a specific message passing library. This contrasts with existing approaches either specific to libraries such as MPI or requiring kernel modification. 
DMTCP provides fault tolerance through checkpointing. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads. DMTCP automatically accounts for TCP/IP sockets, UNIX domain sockets, pipes, ptys (pseudo-terminals), signal handlers, ordinary file descriptors, shared file descriptors, and other operating system artifacts. We demonstrate checkpointing and restart of applications communicating through MPICH2, OpenMPI, and sockets directly. These applications were written with a variety of languages including Fortran, C, C++, and Python. Results show that checkpoint time remains nearly constant as the number of nodes increases.","publication_date":{"day":null,"month":null,"year":2008,"errors":{}},"grobid_abstract_attachment_id":113694238},"translated_abstract":null,"internal_url":"https://www.academia.edu/117958231/DMTCP_Scalable_User_Level_Transparent_Checkpointing_for_Cluster_Computations","translated_internal_url":"","created_at":"2024-04-23T15:13:54.477-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":113694238,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694238/thumbnails/1.jpg","file_name":"download.pdf","download_url":"https://www.academia.edu/attachments/113694238/download_file","bulk_download_file_name":"DMTCP_Scalable_User_Level_Transparent_Ch.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694238/download-libre.pdf?1713911122=\u0026response-content-disposition=attachment%3B+filename%3DDMTCP_Scalable_User_Level_Transparent_Ch.pdf\u0026Expires=1743681116\u0026Signature=XqZOUB7Xxn6n7GlykWGIrZtFaHjHk2sBYjjZPehJSE9hp~FjLi3t4yQ7pYH~DINVF5TPICKic-gJFkfvmGLLSiRqt1HmHF8UIQ8SDaxrR42IiSs36a05LYK~uSaUZFARdegvpiDJVJRfdP~mqNTFVHLP18jgFk8cW3~LtNzof7wF1S--iglqGt~rhlG-zVlJlHAFxNaPprUeXtwBzw6X6Uj-ZMqyBCfa
OzNzw0~d8g0DGJ4o09l6VWlmLkTQmB6~-dS~W4TYdWgWn89k86-748KVR-2sIHRjMpIHdwFmgHUiPmd7oBSY5kLlHrCpN-pQyvqfprqd~3EgInk760pngQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"DMTCP_Scalable_User_Level_Transparent_Checkpointing_for_Cluster_Computations","translated_slug":"","page_count":17,"language":"en","content_type":"Work","summary":"As the size of clusters increases, failures are becoming increasingly frequent. Applications must become fault tolerant if they are to run for extended periods of time. We present DMTCP (Distributed MultiThreaded CheckPointing), the first user-level distributed checkpointing package not dependent on a specific message passing library. This contrasts with existing approaches either specific to libraries such as MPI or requiring kernel modification. DMTCP provides fault tolerance through checkpointing. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads. DMTCP automatically accounts for TCP/IP sockets, UNIX domain sockets, pipes, ptys (pseudo-terminals), signal handlers, ordinary file descriptors, shared file descriptors, and other operating system artifacts. We demonstrate checkpointing and restart of applications communicating through MPICH2, OpenMPI, and sockets directly. These applications were written with a variety of languages including Fortran, C, C++, and Python. 
Results show that checkpoint time remains nearly constant as the number of nodes increases.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":113694238,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694238/thumbnails/1.jpg","file_name":"download.pdf","download_url":"https://www.academia.edu/attachments/113694238/download_file","bulk_download_file_name":"DMTCP_Scalable_User_Level_Transparent_Ch.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694238/download-libre.pdf?1713911122=\u0026response-content-disposition=attachment%3B+filename%3DDMTCP_Scalable_User_Level_Transparent_Ch.pdf\u0026Expires=1743681116\u0026Signature=XqZOUB7Xxn6n7GlykWGIrZtFaHjHk2sBYjjZPehJSE9hp~FjLi3t4yQ7pYH~DINVF5TPICKic-gJFkfvmGLLSiRqt1HmHF8UIQ8SDaxrR42IiSs36a05LYK~uSaUZFARdegvpiDJVJRfdP~mqNTFVHLP18jgFk8cW3~LtNzof7wF1S--iglqGt~rhlG-zVlJlHAFxNaPprUeXtwBzw6X6Uj-ZMqyBCfaOzNzw0~d8g0DGJ4o09l6VWlmLkTQmB6~-dS~W4TYdWgWn89k86-748KVR-2sIHRjMpIHdwFmgHUiPmd7oBSY5kLlHrCpN-pQyvqfprqd~3EgInk760pngQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":442,"name":"Parallel Computing","url":"https://www.academia.edu/Documents/in/Parallel_Computing"},{"id":8137,"name":"Unix","url":"https://www.academia.edu/Documents/in/Unix"},{"id":36300,"name":"Fault Tolerance","url":"https://www.academia.edu/Documents/in/Fault_Tolerance"},{"id":44244,"name":"OPERATING 
SYSTEM","url":"https://www.academia.edu/Documents/in/OPERATING_SYSTEM"},{"id":59487,"name":"Computation","url":"https://www.academia.edu/Documents/in/Computation"},{"id":377043,"name":"Scalability","url":"https://www.academia.edu/Documents/in/Scalability"},{"id":2038221,"name":"Computer Cluster","url":"https://www.academia.edu/Documents/in/Computer_Cluster"},{"id":3336228,"name":"python programming language","url":"https://www.academia.edu/Documents/in/python_programming_language"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-117958231-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="117958223"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/117958223/System_level_Scalable_Checkpoint_Restart_for_Petascale_Computing"><img alt="Research paper thumbnail of System-level Scalable Checkpoint-Restart for Petascale Computing" class="work-thumbnail" src="https://attachments.academia-assets.com/113694213/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/117958223/System_level_Scalable_Checkpoint_Restart_for_Petascale_Computing">System-level Scalable Checkpoint-Restart for Petascale Computing</a></div><div class="wp-workCard_item"><span>arXiv (Cornell University)</span><span>, Jul 27, 2016</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Fault tolerance for the upcoming exascale generation has long been an area of active research. 
On...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Fault tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that InfiniBand UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. 
The approach is also evaluated across three widely used MPI implementations.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="1fb54398cf1659068c2e70608a9a44c4" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:113694213,&quot;asset_id&quot;:117958223,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/113694213/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="117958223"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="117958223"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 117958223; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=117958223]").text(description); $(".js-view-count[data-work-id=117958223]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 117958223; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='117958223']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); 
container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "1fb54398cf1659068c2e70608a9a44c4" } } $('.js-work-strip[data-work-id=117958223]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":117958223,"title":"System-level Scalable Checkpoint-Restart for Petascale Computing","translated_title":"","metadata":{"publisher":"Cornell University","ai_title_tag":"Scalable Checkpoint-Restart for Petascale HPC","grobid_abstract":"Fault tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that Infini-Band UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. 
In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. The approach is also evaluated across three widely used MPI implementations.","publication_date":{"day":27,"month":7,"year":2016,"errors":{}},"publication_name":"arXiv (Cornell University)","grobid_abstract_attachment_id":113694213},"translated_abstract":null,"internal_url":"https://www.academia.edu/117958223/System_level_Scalable_Checkpoint_Restart_for_Petascale_Computing","translated_internal_url":"","created_at":"2024-04-23T15:13:23.548-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":113694213,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694213/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/113694213/download_file","bulk_download_file_name":"System_level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694213/1607-libre.pdf?1713911128=\u0026response-content-disposition=attachment%3B+filename%3DSystem_level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681116\u0026Signature=grFADiIVUOrEjyqtnQi7JgTjvIyGHwogxiRu3lm~C-8-rEwLiM~wke6tk2iee6bH99pZYRcK2oOh6f0M4LMdT1uwQyOVQzlwnmbHRcAP9i8uQUqA1I9xCXFIb7caG6VVoKADRecgY2CoNyr1ouGoR962Kb6SFNgQRtNsmWckN6iURRhvh3nFxrS2FK9Yfg2E37XEqO74ckOqZCErbafRBXomH8FbBcCe9~4hXLYKj3sWMqCseP69UE35tkMakMNz2UGTY4zfkMEXeB4~3jsKJqQ9ULsyFsqJcxFY8M3txOx-s2ynOjANK-WTKbAfRDYgykl4-vtHqE9JUuHpy8SNNw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"System_level_Scalable_Checkpoint_Restart_for_Petascale_Computing","translated_slug":"","page_count":18,"language":"en","content_type":"Work","summary":"Fault tolerance for the upcoming exascale generation has long been 
an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that Infini-Band UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. The approach is also evaluated across three widely used MPI implementations.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":113694213,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694213/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/113694213/download_file","bulk_download_file_name":"System_level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694213/1607-libre.pdf?1713911128=\u0026response-content-disposition=attachment%3B+filename%3DSystem_level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681116\u0026Signature=grFADiIVUOrEjyqtnQi7JgTjvIyGHwogxiRu3lm~C-8-rEwLiM~wke6tk2iee6bH99pZYRcK2oOh6f0M4LMdT1uwQyOVQzlwnmbHRcAP9i8uQUqA1I9xCXFIb7caG6VVoKADRecgY2CoNyr1ouGoR962Kb6SFNgQRtNsmWckN6iURRhvh3nFxrS2FK9Yfg2E37XEqO74ckOqZCErbafRBXomH8FbBcCe9~4hXLYKj3sWMqCseP69UE35tkMakMNz2UGTY4zfkMEXeB4~3jsKJqQ9ULsyFsqJcxFY8M3txOx-s2ynOjANK-WTKbAfRDYgykl4-vtHqE9JUuHpy8SNNw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":113694212,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694212/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/113694212/download_file","bulk_download_file_name":"System_level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694212/1607-libre.pdf?1713911123=\u0026response-content-disposition=attachment%3B+filename%3DSystem_level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681116\u0026Signature=KX3LYkqNVWXGdt8V3TymY~Bo6WKcjDYfK-k6Uahja2Vd4xc~MaKuxWlXi-yiRCdnZJ3n6tFDbvdD8MkZOCfmhR-QcqawRvV4068NP0VaX7-uL4-a-k2PXZhr0Y9LWRvXZEDCKQel2AruubKYnk5yjKdYZFbcPx7rcbJ8zCQKzCg6Jbst4y8Nl8glFGMScVkm4hZSZDg4IdfIDuWFoCvHMJa-uan9ouz9DIiAZEZon9~ooEaTJOJ3QSNExXQbDxnnh-8QA4sQk50OO~1YK40AaQsrIOn-YDBaTrFSy3ru-4DklT8dG7Yqm3FkbpMUnJAXpoVZBtYMlMzqSpCEfET~dQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer 
Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":442,"name":"Parallel Computing","url":"https://www.academia.edu/Documents/in/Parallel_Computing"},{"id":36300,"name":"Fault Tolerance","url":"https://www.academia.edu/Documents/in/Fault_Tolerance"},{"id":238655,"name":"Implementation","url":"https://www.academia.edu/Documents/in/Implementation"},{"id":377043,"name":"Scalability","url":"https://www.academia.edu/Documents/in/Scalability"},{"id":491492,"name":"InfiniBand","url":"https://www.academia.edu/Documents/in/InfiniBand"},{"id":2141217,"name":"Supercomputer","url":"https://www.academia.edu/Documents/in/Supercomputer"}],"urls":[{"id":41341945,"url":"http://arxiv.org/pdf/1607.07995"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-117958223-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="92504210"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/92504210/Extended_Batch_Sessions_and_Three_Phase_Debugging"><img alt="Research paper thumbnail of Extended Batch Sessions and Three-Phase Debugging" class="work-thumbnail" src="https://attachments.academia-assets.com/95496120/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/92504210/Extended_Batch_Sessions_and_Three_Phase_Debugging">Extended Batch Sessions and Three-Phase Debugging</a></div><div class="wp-workCard_item"><span>Proceedings of the XSEDE16 Conference on Diversity, Big Data, and Science at Scale</span><span>, 2016</span></div><div class="wp-workCard_item"><span 
class="js-work-more-abstract-truncated">Batch environments are notoriously unfriendly because it&#39;s not easy to interactively diagnose the...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Batch environments are notoriously unfriendly because it&#39;s not easy to interactively diagnose the health of a job. A job may be terminated without warning when it reaches the end of an allotted runtime slot, or it may terminate even sooner due to an unsuspected bug that occurs only at large scale. Two strategies are proposed that take advantage of DMTCP (Distributed MultiThreaded CheckPointing) for system-level checkpointing. First, we describe a three-phase debugging strategy that permits one to interactively debug long-running MPI applications that were developed for noninteractive batch environments. Second, we review how to use the SLURM resource manager capability to easily implement extended batch sessions that overcome the typical limitation of 24 hours maximum for a single batch job on large HPC resources. We argue for greater use of this lesser known capability, as a means to remove the necessity for the application-specific checkpointing found in many long-running jobs. 
CCS Concepts •Software and its engineering → Checkpoint / restart; Software testing and debugging;</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="b85581bfed410b8ceec7424b0b1ca574" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:95496120,&quot;asset_id&quot;:92504210,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/95496120/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="92504210"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="92504210"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 92504210; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=92504210]").text(description); $(".js-view-count[data-work-id=92504210]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 92504210; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='92504210']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); 
container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "b85581bfed410b8ceec7424b0b1ca574" } } $('.js-work-strip[data-work-id=92504210]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":92504210,"title":"Extended Batch Sessions and Three-Phase Debugging","translated_title":"","metadata":{"publisher":"ACM","grobid_abstract":"Batch environments are notoriously unfriendly because it's not easy to interactively diagnose the health of a job. A job may be terminated without warning when it reaches the end of an allotted runtime slot, or it may terminate even sooner due to an unsuspected bug that occurs only at large scale. Two strategies are proposed that take advantage of DMT-CP (Distributed MultiThreaded CheckPointing) for systemlevel checkpointing. First, we describe a three-phase debugging strategy that permits one to interactively debug long-running MPI applications that were developed for noninteractive batch environments. Second, we review how to use the SLURM resource manager capability to easily implement extended batch sessions that overcome the typical limitation of 24 hours maximum for a single batch job on large HPC resources. 
We argue for greater use of this lesser known capability, as a means to remove the necessity for the application-specific checkpointing found in many longrunning jobs. CCS Concepts •Software and its engineering → Checkpoint / restart; Software testing and debugging;","publication_date":{"day":null,"month":null,"year":2016,"errors":{}},"publication_name":"Proceedings of the XSEDE16 Conference on Diversity, Big Data, and Science at Scale","grobid_abstract_attachment_id":95496120},"translated_abstract":null,"internal_url":"https://www.academia.edu/92504210/Extended_Batch_Sessions_and_Three_Phase_Debugging","translated_internal_url":"","created_at":"2022-12-09T09:54:14.020-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":95496120,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496120/thumbnails/1.jpg","file_name":"2949550.pdf","download_url":"https://www.academia.edu/attachments/95496120/download_file","bulk_download_file_name":"Extended_Batch_Sessions_and_Three_Phase.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496120/2949550-libre.pdf?1670611737=\u0026response-content-disposition=attachment%3B+filename%3DExtended_Batch_Sessions_and_Three_Phase.pdf\u0026Expires=1743681116\u0026Signature=ANXxvFlML1ANNEFpS41RUFcumBmLgW46vb5HJ9By9LIVXYlJldh1DgHJbP7fQTRF017VJQMqj9k6sS4ULCnWsfdc-bRlbVtAc1D7toFJylrjSDvEYHSsCw~9DmoXsVoLRVp5eOnfgujaMTPmGlgWKbQdIAzBmPWGsWRIpdFFJxosyn5OnGG2D7Ih-LcT42BTSQo2IkimzJTZJ-peDmKVZYlTmplHAwADM0Gk7Zm3TsFyVbiaYcxBXt5dK~fKMVI7KksjJqKIFI30f62hvulZqRLpHU19DtN3EpFm2Ju5D~-jd1dY0DDa10jq0lyO2LlOMYx2l1dQ6YqXuUbtNp~Eqg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Extended_Batch_Sessions_and_Three_Phase_Debugging","translated_slug":"","page_count":8,"language":"en","content_type":"Work","summary":"Batch environments are notoriously 
unfriendly because it's not easy to interactively diagnose the health of a job. A job may be terminated without warning when it reaches the end of an allotted runtime slot, or it may terminate even sooner due to an unsuspected bug that occurs only at large scale. Two strategies are proposed that take advantage of DMT-CP (Distributed MultiThreaded CheckPointing) for systemlevel checkpointing. First, we describe a three-phase debugging strategy that permits one to interactively debug long-running MPI applications that were developed for noninteractive batch environments. Second, we review how to use the SLURM resource manager capability to easily implement extended batch sessions that overcome the typical limitation of 24 hours maximum for a single batch job on large HPC resources. We argue for greater use of this lesser known capability, as a means to remove the necessity for the application-specific checkpointing found in many longrunning jobs. CCS Concepts •Software and its engineering → Checkpoint / restart; Software testing and debugging;","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":95496120,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496120/thumbnails/1.jpg","file_name":"2949550.pdf","download_url":"https://www.academia.edu/attachments/95496120/download_file","bulk_download_file_name":"Extended_Batch_Sessions_and_Three_Phase.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496120/2949550-libre.pdf?1670611737=\u0026response-content-disposition=attachment%3B+filename%3DExtended_Batch_Sessions_and_Three_Phase.pdf\u0026Expires=1743681116\u0026Signature=ANXxvFlML1ANNEFpS41RUFcumBmLgW46vb5HJ9By9LIVXYlJldh1DgHJbP7fQTRF017VJQMqj9k6sS4ULCnWsfdc-bRlbVtAc1D7toFJylrjSDvEYHSsCw~9DmoXsVoLRVp5eOnfgujaMTPmGlgWKbQdIAzBmPWGsWRIpdFFJxosyn5OnGG2D7Ih-LcT42BTSQo2IkimzJTZJ-peDmKVZYlTmplHAwADM0Gk7Zm3TsFyVbiaYcxBXt5dK~fKMVI7KksjJqKIFI30f62hvulZqRLpHU19DtN3EpFm2Ju5D~-jd1dY0DDa10jq0lyO2LlOMYx2l1dQ6YqXuUbtNp~Eqg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":95496121,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496121/thumbnails/1.jpg","file_name":"2949550.pdf","download_url":"https://www.academia.edu/attachments/95496121/download_file","bulk_download_file_name":"Extended_Batch_Sessions_and_Three_Phase.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496121/2949550-libre.pdf?1670611736=\u0026response-content-disposition=attachment%3B+filename%3DExtended_Batch_Sessions_and_Three_Phase.pdf\u0026Expires=1743681116\u0026Signature=FlP7u4U~86SbzDGFHk~aRrus9XBMVgiIUBLgXobmm1P6Vyh6MVjLZrV7xnuWd~honimG-nosJ1AYMhsX8kmBU8QG-6oFAaA3jt1D-jv1q9fspKXVeTFR5dU8Njmi8PW8yWEHaYS1spzbadTsYA~0Mawk6Em-NM7BCzjQeNh~M7lymqcQfFr3ZuMtC1OT7Sq-Gf18flzgFctWSNiVffMAnAJnFR9sTGLkmkBJFR2SJ98ixFL2MLhJcD7JyAS~rWJ~y5FskvR5XtpGnmBASUkZGFJg7VKGfHU8mbjQpR9LeDwKvzhyvpgyWElxM-fsdJ-DhJulQJkQnpc4-CMDkj8ChQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer 
Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":440,"name":"Distributed Computing","url":"https://www.academia.edu/Documents/in/Distributed_Computing"},{"id":568451,"name":"Batch Processing","url":"https://www.academia.edu/Documents/in/Batch_Processing"},{"id":879152,"name":"Debugging","url":"https://www.academia.edu/Documents/in/Debugging"}],"urls":[{"id":26834961,"url":"https://dl.acm.org/doi/pdf/10.1145/2949550.2949645"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-92504210-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="92504179"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/92504179/Memory_harvesting_VMs_in_cloud_platforms"><img alt="Research paper thumbnail of Memory-harvesting VMs in cloud platforms" class="work-thumbnail" src="https://attachments.academia-assets.com/95496086/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/92504179/Memory_harvesting_VMs_in_cloud_platforms">Memory-harvesting VMs in cloud platforms</a></div><div class="wp-workCard_item"><span>Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems</span><span>, 2022</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Cloud platforms monetize their spare capacity by renting &quot;Spot&quot; virtual machines (VMs) that can b...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" 
data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Cloud platforms monetize their spare capacity by renting &quot;Spot&quot; virtual machines (VMs) that can be evicted in favor of higher-priority VMs. Recent work has shown that resource-harvesting VMs are more effective at exploiting spare capacity than Spot VMs, while also reducing the number of evictions. However, the prior work focused on harvesting CPU cores while keeping memory size fixed. This wastes a substantial monetization opportunity and may even limit the ability of harvesting VMs to leverage spare cores. Thus, in this paper, we explore memory harvesting and its challenges in real cloud platforms, namely its impact on VM creation time, NUMA spanning, and page fragmentation. We start by characterizing the amount and dynamics of the spare memory in Azure. We then design and implement memory-harvesting VMs (MHVMs), introducing new techniques for memory buffering, batching, and pre-reclamation. To demonstrate the use of MHVMs, we also extend a popular cluster scheduling framework (Hadoop) and a FaaS platform to adapt to them. Our main results show that (1) there is plenty of scope for memory harvesting in real platforms; (2) MHVMs are effective at mitigating the negative impacts of harvesting; and (3) our extensions of Hadoop and FaaS successfully hide the MHVMs&#39; varying memory size from the users&#39; data-processing jobs and functions. We conclude that memory harvesting has great potential for practical deployment and users can save up to 93% of their costs when running workloads on MHVMs. 
CCS CONCEPTS • Computer systems organization → Cloud computing.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="d7d751c6739209985055ccf14b465ef4" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:95496086,&quot;asset_id&quot;:92504179,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/95496086/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="92504179"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="92504179"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 92504179; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=92504179]").text(description); $(".js-view-count[data-work-id=92504179]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 92504179; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='92504179']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); 
container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "d7d751c6739209985055ccf14b465ef4" } } $('.js-work-strip[data-work-id=92504179]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":92504179,"title":"Memory-harvesting VMs in cloud platforms","translated_title":"","metadata":{"publisher":"ACM","grobid_abstract":"Cloud platforms monetize their spare capacity by renting \"Spot\" virtual machines (VMs) that can be evicted in favor of higher-priority VMs. Recent work has shown that resource-harvesting VMs are more effective at exploiting spare capacity than Spot VMs, while also reducing the number of evictions. However, the prior work focused on harvesting CPU cores while keeping memory size fixed. This wastes a substantial monetization opportunity and may even limit the ability of harvesting VMs to leverage spare cores. Thus, in this paper, we explore memory harvesting and its challenges in real cloud platforms, namely its impact on VM creation time, NUMA spanning, and page fragmentation. We start by characterizing the amount and dynamics of the spare memory in Azure. We then design and implement memory-harvesting VMs (MHVMs), introducing new techniques for memory buffering, batching, and pre-reclamation. 
To demonstrate the use of MHVMs, we also extend a popular cluster scheduling framework (Hadoop) and a FaaS platform to adapt to them. Our main results show that (1) there is plenty of scope for memory harvesting in real platforms; (2) MHVMs are effective at mitigating the negative impacts of harvesting; and (3) our extensions of Hadoop and FaaS successfully hide the MHVMs' varying memory size from the users' data-processing jobs and functions. We conclude that memory harvesting has great potential for practical deployment and users can save up to 93% of their costs when running workloads on MHVMs. CCS CONCEPTS • Computer systems organization → Cloud computing.","publication_date":{"day":null,"month":null,"year":2022,"errors":{}},"publication_name":"Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","grobid_abstract_attachment_id":95496085},"translated_abstract":null,"internal_url":"https://www.academia.edu/92504179/Memory_harvesting_VMs_in_cloud_platforms","translated_internal_url":"","created_at":"2022-12-09T09:53:37.035-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":95496086,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496086/thumbnails/1.jpg","file_name":"3503222.pdf","download_url":"https://www.academia.edu/attachments/95496086/download_file","bulk_download_file_name":"Memory_harvesting_VMs_in_cloud_platforms.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496086/3503222-libre.pdf?1670611753=\u0026response-content-disposition=attachment%3B+filename%3DMemory_harvesting_VMs_in_cloud_platforms.pdf\u0026Expires=1743681117\u0026Signature=aFmj6vgPJHPA1hj0~AEUFE8U-jmR5qfiqp-TzEFgcBK0mk~YKGgntc9H6YzLtsCbHK5R~TVch-NSKIHX2Jg6pz3JI~xP9f-qntSvaSs77LiYiI4khMlCH99Pv1gSSSKdatPBt6h
JSLWiPrEVypbrKPTOmI8QEKpBvYTCX0vn2hhNepRcWBA6WhOPwgmvAlXL5c3RqaVOwXdNkdmJnyRTU0WkJjtKkPRxFHe37StbnKV3LtBDH1~bV9U7meTjlnDWeSF-~aNBEQw79cndjKCHsnQ6NfKSE47OmJ-n3XusLxIVCxIGXRwU6w0pJ8rj94PcsQn-dCuYgJt49Cmf1ONmWg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Memory_harvesting_VMs_in_cloud_platforms","translated_slug":"","page_count":12,"language":"en","content_type":"Work","summary":"Cloud platforms monetize their spare capacity by renting \"Spot\" virtual machines (VMs) that can be evicted in favor of higher-priority VMs. Recent work has shown that resource-harvesting VMs are more effective at exploiting spare capacity than Spot VMs, while also reducing the number of evictions. However, the prior work focused on harvesting CPU cores while keeping memory size fixed. This wastes a substantial monetization opportunity and may even limit the ability of harvesting VMs to leverage spare cores. Thus, in this paper, we explore memory harvesting and its challenges in real cloud platforms, namely its impact on VM creation time, NUMA spanning, and page fragmentation. We start by characterizing the amount and dynamics of the spare memory in Azure. We then design and implement memory-harvesting VMs (MHVMs), introducing new techniques for memory buffering, batching, and pre-reclamation. To demonstrate the use of MHVMs, we also extend a popular cluster scheduling framework (Hadoop) and a FaaS platform to adapt to them. Our main results show that (1) there is plenty of scope for memory harvesting in real platforms; (2) MHVMs are effective at mitigating the negative impacts of harvesting; and (3) our extensions of Hadoop and FaaS successfully hide the MHVMs' varying memory size from the users' data-processing jobs and functions. We conclude that memory harvesting has great potential for practical deployment and users can save up to 93% of their costs when running workloads on MHVMs. 
CCS CONCEPTS • Computer systems organization → Cloud computing.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":95496086,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496086/thumbnails/1.jpg","file_name":"3503222.pdf","download_url":"https://www.academia.edu/attachments/95496086/download_file","bulk_download_file_name":"Memory_harvesting_VMs_in_cloud_platforms.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496086/3503222-libre.pdf?1670611753=\u0026response-content-disposition=attachment%3B+filename%3DMemory_harvesting_VMs_in_cloud_platforms.pdf\u0026Expires=1743681117\u0026Signature=aFmj6vgPJHPA1hj0~AEUFE8U-jmR5qfiqp-TzEFgcBK0mk~YKGgntc9H6YzLtsCbHK5R~TVch-NSKIHX2Jg6pz3JI~xP9f-qntSvaSs77LiYiI4khMlCH99Pv1gSSSKdatPBt6hJSLWiPrEVypbrKPTOmI8QEKpBvYTCX0vn2hhNepRcWBA6WhOPwgmvAlXL5c3RqaVOwXdNkdmJnyRTU0WkJjtKkPRxFHe37StbnKV3LtBDH1~bV9U7meTjlnDWeSF-~aNBEQw79cndjKCHsnQ6NfKSE47OmJ-n3XusLxIVCxIGXRwU6w0pJ8rj94PcsQn-dCuYgJt49Cmf1ONmWg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":95496085,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496085/thumbnails/1.jpg","file_name":"3503222.pdf","download_url":"https://www.academia.edu/attachments/95496085/download_file","bulk_download_file_name":"Memory_harvesting_VMs_in_cloud_platforms.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496085/3503222-libre.pdf?1670611765=\u0026response-content-disposition=attachment%3B+filename%3DMemory_harvesting_VMs_in_cloud_platforms.pdf\u0026Expires=1743681117\u0026Signature=A5rUIm0pn2fxsi7W4oqIFh67BiI8qo6SkF097hRQyBfXSxhq0j1Q4-Wh4RDsLlB2-KMi6tZL9lNEDfzaOqqcqVe7hORy2nP1g8n3CwWQ6atqN1egrln2syalz7t2i9GD4KqCxSSPlYuObLZ1PNpkdWs1eajdVfsuT~36SKoKZ
yT002h5tKEFNXGjgse~zxiqRdA2dyi4pES38VTlzvXUn85qUxTIrvBdrC2gR0rDXh-STx4EsDLX6sLKt2QJq-emaIhTX8EXidpB9PpWcHVh0-Q7wRXO6zIZ77a1XqqEI6ZaFeWOa7vmjexGh~K2V2o9IWuErENSLteC~IRDJXYk8Q__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":26860,"name":"Cloud Computing","url":"https://www.academia.edu/Documents/in/Cloud_Computing"}],"urls":[{"id":26834941,"url":"https://dl.acm.org/doi/pdf/10.1145/3503222.3507725"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-92504179-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="82956344"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/82956344/2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CONF_SCIPY_2013_DMTCP_Bringing_Checkpoint_Restart_to_Python"><img alt="Research paper thumbnail of 2 PROC. OF THE 12th PYTHON IN SCIENCE CONF. (SCIPY 2013) DMTCP: Bringing Checkpoint-Restart to Python" class="work-thumbnail" src="https://attachments.academia-assets.com/88480680/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/82956344/2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CONF_SCIPY_2013_DMTCP_Bringing_Checkpoint_Restart_to_Python">2 PROC. OF THE 12th PYTHON IN SCIENCE CONF. (SCIPY 2013) DMTCP: Bringing Checkpoint-Restart to Python</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. 
...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. It operates in user-space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython &quot;from the outside&quot; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be requested interactively within a Python session, or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process), and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using VNC; (ii) a Fast/Slow technique to use multiple hosts or cores to check one Cython computation in parallel; and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint f...</span></div><div class="wp-workCard_item"><div class="carousel-container carousel-container--sm" id="profile-work-82956344-figures"><div class="prev-slide-container js-prev-button-container"><button aria-label="Previous" class="carousel-navigation-button js-profile-work-82956344-figures-prev"><span class="material-symbols-outlined" style="font-size: 24px" translate="no">arrow_back_ios</span></button></div><div class="slides-container js-slides-container"><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455763/figure-5-to-checkpoint-image-file-each-process-has-its-own"><img alt="to a checkpoint image file. Each process has its own checkpoint image. 
Prior to checkpoint, each plugin will have copied into user-space memory any kernel state associated with its concerns. Examples of such concerns include network sockets, files, and pseudo-terminals. Once the checkpoint image has been created, the checkpoint thread &quot;un-quiesces&quot; the user threads and they resume executing application code. " class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_005.jpg" /></a></figure><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455733/figure-1-fast-cython-with-slow-cpython-checking-nodes"><img alt="Fig. 1: Fast Cython with Slow CPython &quot;checking&quot; nodes. " class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_001.jpg" /></a></figure><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455743/figure-2-to-execute-the-undo-command-the-debugging-session"><img alt="To execute the UNDO command, the debugging session is restarted from the checkpoint image, and the debugging commands are automatically re-executed from the list excluding the last command. This takes the process back to before the debugger command was issued. " class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_002.jpg" /></a></figure><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455749/figure-4-reverse-expression-watchpoint-command-into-series"><img alt="Fig. 4: Reverse Expression Watchpoint. command into a series of commands terminating with step is non-trivial, and an algorithm for that decomposition is presented in [Visan11]. " class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_003.jpg" /></a></figure><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455758/figure-5-architecture-of-dmtcp"><img alt="Fig. 5: Architecture of DMTCP. 
" class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_004.jpg" /></a></figure></div><div class="next-slide-container js-next-button-container"><button aria-label="Next" class="carousel-navigation-button js-profile-work-82956344-figures-next"><span class="material-symbols-outlined" style="font-size: 24px" translate="no">arrow_forward_ios</span></button></div></div></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="d98fb89f03b46ab4e5d41dbf21d5985a" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:88480680,&quot;asset_id&quot;:82956344,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/88480680/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="82956344"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="82956344"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 82956344; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=82956344]").text(description); $(".js-view-count[data-work-id=82956344]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var 
workId = 82956344; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='82956344']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "d98fb89f03b46ab4e5d41dbf21d5985a" } } $('.js-work-strip[data-work-id=82956344]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":82956344,"title":"2 PROC. OF THE 12th PYTHON IN SCIENCE CONF. (SCIPY 2013) DMTCP: Bringing Checkpoint-Restart to Python","translated_title":"","metadata":{"abstract":"Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. It operates in user-space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026quot;from the outside \u0026quot; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be re-quested interactively within a Python session, or under the control of a specific Python program. 
Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process), and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using VNC; (ii) a Fast/Slow technique to use multiple hosts or cores to check one Cython computation in parallel; and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint f...","publication_date":{"day":null,"month":null,"year":2016,"errors":{}}},"translated_abstract":"Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. It operates in user-space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026quot;from the outside \u0026quot; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be re-quested interactively within a Python session, or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process), and upon restarting (from a checkpoint image). 
Applications of DMTCP are demonstrated for: (i) Python-based graphics using VNC; (ii) a Fast/Slow technique to use multiple hosts or cores to check one Cython computation in parallel; and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint f...","internal_url":"https://www.academia.edu/82956344/2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CONF_SCIPY_2013_DMTCP_Bringing_Checkpoint_Restart_to_Python","translated_internal_url":"","created_at":"2022-07-11T08:22:33.099-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":88480680,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/88480680/thumbnails/1.jpg","file_name":"arya.pdf","download_url":"https://www.academia.edu/attachments/88480680/download_file","bulk_download_file_name":"2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CON.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/88480680/arya-libre.pdf?1657584648=\u0026response-content-disposition=attachment%3B+filename%3D2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CON.pdf\u0026Expires=1743681117\u0026Signature=gMJ-thrxMQVjbyyVpIkLnhQJ8T2BdKDFqQUctIqQAvlq8AGua99mpDN12K9aZG4rNIon340pWFXF9IbXNl2v6j20oW1vbQ4McHPSdCXLGq8j39KgtZrDFWyMdPJUKTyxS2KNpcHN8JMIOPfugPyZe5TPDFM9smKaUvtvEDOMrkxU2R9JsWZbBbu46YmeiDVbYpRIh~hF8b976cVhmlcg9u4-59Yylk2igSG~36wSOxu5w5FrY40TNPrIJ4CXa5KXanMik4vD44y3jRqlwQqO~Yor010MRoB0AlgV-0S6RlVLQMXGZrhwkLZvmAh7TGWz9DW4GLtOU6Ypzk8KaUWfrA__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CONF_SCIPY_2013_DMTCP_Bringing_Checkpoint_Restart_to_Python","translated_slug":"","page_count":6,"language":"en","content_type":"Work","summary":"Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. 
It operates in user-space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026quot;from the outside \u0026quot; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be re-quested interactively within a Python session, or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process), and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using VNC; (ii) a Fast/Slow technique to use multiple hosts or cores to check one Cython computation in parallel; and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint f...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":88480680,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/88480680/thumbnails/1.jpg","file_name":"arya.pdf","download_url":"https://www.academia.edu/attachments/88480680/download_file","bulk_download_file_name":"2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CON.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/88480680/arya-libre.pdf?1657584648=\u0026response-content-disposition=attachment%3B+filename%3D2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CON.pdf\u0026Expires=1743681117\u0026Signature=gMJ-thrxMQVjbyyVpIkLnhQJ8T2BdKDFqQUctIqQAvlq8AGua99mpDN12K9aZG4rNIon340pWFXF9IbXNl2v6j20oW1vbQ4McHPSdCXLGq8j39KgtZrDFWyMdPJUKTyxS2KNpcHN8JMIOPfugPyZe5TPDFM9smKaUvtvEDOMrkxU2R9JsWZbBbu46YmeiDVbYpRIh~hF8b976cVhmlcg9u4-59Yylk2igSG~36wSOxu5w5FrY40TNPrIJ4CXa5KXanMik4vD44y3jRqlwQqO~Yor010MRoB0AlgV-0S6RlVLQMXGZrhwkLZvmAh7TGWz9DW4GLtOU6Ypzk8KaUWfrA__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":1145043,"name":"Ipython","url":"https://www.academia.edu/Documents/in/Ipython"}],"urls":[{"id":22083991,"url":"http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.897.6773\u0026rep=rep1\u0026type=pdf"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (true) { Aedu.setUpFigureCarousel('profile-work-82956344-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="78073660"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/78073660/System_Level_Scalable_Checkpoint_Restart_for_Petascale_Computing"><img alt="Research paper thumbnail of System-Level Scalable Checkpoint-Restart for Petascale Computing" class="work-thumbnail" src="https://attachments.academia-assets.com/85247513/thumbnails/1.jpg" 
/></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/78073660/System_Level_Scalable_Checkpoint_Restart_for_Petascale_Computing">System-Level Scalable Checkpoint-Restart for Petascale Computing</a></div><div class="wp-workCard_item"><span>2016 IEEE 22nd International Conference on Parallel and Distributed Systems (ICPADS)</span><span>, 2016</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Fault tolerance for the upcoming exascale generation has long been an area of active research. On...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Fault tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that Infini-Band UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. 
Runtime overhead is reduced to less than 1%. The approach is also evaluated across three widely used MPI implementations.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="35d201cc48af1fa7bfe1fac603ba450c" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:85247513,&quot;asset_id&quot;:78073660,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/85247513/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="78073660"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="78073660"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 78073660; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=78073660]").text(description); $(".js-view-count[data-work-id=78073660]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 78073660; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='78073660']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); 
container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "35d201cc48af1fa7bfe1fac603ba450c" } } $('.js-work-strip[data-work-id=78073660]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":78073660,"title":"System-Level Scalable Checkpoint-Restart for Petascale Computing","translated_title":"","metadata":{"publisher":"IEEE","grobid_abstract":"Fault tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that Infini-Band UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. 
In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. The approach is also evaluated across three widely used MPI implementations.","publication_date":{"day":null,"month":null,"year":2016,"errors":{}},"publication_name":"2016 IEEE 22nd International Conference on Parallel and Distributed Systems (ICPADS)","grobid_abstract_attachment_id":85247513},"translated_abstract":null,"internal_url":"https://www.academia.edu/78073660/System_Level_Scalable_Checkpoint_Restart_for_Petascale_Computing","translated_internal_url":"","created_at":"2022-04-30T14:18:38.786-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":85247513,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247513/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/85247513/download_file","bulk_download_file_name":"System_Level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247513/1607-libre.pdf?1651354198=\u0026response-content-disposition=attachment%3B+filename%3DSystem_Level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681117\u0026Signature=WD91O8CN-hjkpgvAYOAycMpMKE0p5YhKu0vm53rd~LSMpECPWI~PUvgOEinCm49UXr57ws6ndeCvM~wzwR2IOPuNwYkIBhde6ZhvR9ApG-3U3sXipExrSF-~dbHEWyhnp7CIk6HwJE2mB4U-fdg9GowtVbnRObcFU~k3i37twtQLgarl2y4MEzGPcx9Y1xnVutgnCWnAEFkU7oaa87li72LZjCSh7G1ORe14PsLZlOQqHuHDOj73gIDaQrfbhHhwrR6wn9P~ubu-bJIOyasYIT2QfXV1ycGnc6iv1~w97jp3CCnXLFq0M9bFeDuOw9RRn19-oEHWjnBtKTwdtsliZg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"System_Level_Scalable_Checkpoint_Restart_for_Petascale_Computing","translated_slug":"","page_count":18,"language":"en","content_type":"Work","summary":"Fault 
tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that Infini-Band UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. 
The approach is also evaluated across three widely used MPI implementations.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":85247513,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247513/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/85247513/download_file","bulk_download_file_name":"System_Level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247513/1607-libre.pdf?1651354198=\u0026response-content-disposition=attachment%3B+filename%3DSystem_Level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681117\u0026Signature=WD91O8CN-hjkpgvAYOAycMpMKE0p5YhKu0vm53rd~LSMpECPWI~PUvgOEinCm49UXr57ws6ndeCvM~wzwR2IOPuNwYkIBhde6ZhvR9ApG-3U3sXipExrSF-~dbHEWyhnp7CIk6HwJE2mB4U-fdg9GowtVbnRObcFU~k3i37twtQLgarl2y4MEzGPcx9Y1xnVutgnCWnAEFkU7oaa87li72LZjCSh7G1ORe14PsLZlOQqHuHDOj73gIDaQrfbhHhwrR6wn9P~ubu-bJIOyasYIT2QfXV1ycGnc6iv1~w97jp3CCnXLFq0M9bFeDuOw9RRn19-oEHWjnBtKTwdtsliZg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":111436,"name":"IEEE","url":"https://www.academia.edu/Documents/in/IEEE"}],"urls":[{"id":20075549,"url":"http://xplorestaging.ieee.org/ielx7/7822825/7823715/07823840.pdf?arnumber=7823840"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-78073660-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="78073659"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" 
data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/78073659/Transition_Watchpoints_Teaching_Old_Debuggers_New_Tricks"><img alt="Research paper thumbnail of Transition Watchpoints: Teaching Old Debuggers New Tricks" class="work-thumbnail" src="https://attachments.academia-assets.com/85247512/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/78073659/Transition_Watchpoints_Teaching_Old_Debuggers_New_Tricks">Transition Watchpoints: Teaching Old Debuggers New Tricks</a></div><div class="wp-workCard_item"><span>The Art, Science, and Engineering of Programming</span><span>, 2017</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Reversible debuggers and process replay have been developed at least since. This vision enables o...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Reversible debuggers and process replay have been developed at least since. This vision enables one to execute backwards in time under a debugger. Two important problems in practice are that, first, current reversible debuggers are slow when reversing over long time periods, and, second, after building one reversible debugger, it is difficult to transfer that achievement to a new programming environment. The user observes a bug when arriving at an error. Searching backwards for the corresponding fault may require many reverse steps. Ultimately, the user prefers to write an expression that will transition to false upon arriving at the fault. 
The solution is an expression-transition watchpoint facility based on top of snapshots and record/replay. Expression-transition watchpoints are implemented as binary search through the timeline of a program execution, while using the snapshots as landmarks within that timeline. This allows for debugging of subtle bugs that appear only after minutes or more of program execution. When a bug occurs within seconds of program startup, repeated debugging sessions suffice. Reversible debugging is preferred for bugs seen only after minutes. This architecture allows for an efficient and easy-to-write snapshot-based reversible debugger on top of a conventional debugger. The validity of this approach was tested by developing four personalities (for GDB, MATLAB, Perl, and Python), with each personality typically requiring just lines of code.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="152f7f157eddb106f82970ed95fdd4f1" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:85247512,&quot;asset_id&quot;:78073659,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/85247512/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="78073659"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="78073659"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 78073659; 
window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=78073659]").text(description); $(".js-view-count[data-work-id=78073659]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 78073659; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='78073659']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "152f7f157eddb106f82970ed95fdd4f1" } } $('.js-work-strip[data-work-id=78073659]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":78073659,"title":"Transition Watchpoints: Teaching Old Debuggers New Tricks","translated_title":"","metadata":{"publisher":"Aspect-Oriented Software Association (AOSA)","ai_title_tag":"Efficient Reversible Debugging with Expression-Transition Watchpoints","grobid_abstract":"Reversible debuggers and process replay have been developed at least since. 
This vision enables one to execute backwards in time under a debugger. Two important problems in practice are that, first, current reversible debuggers are slow when reversing over long time periods, and, second, after building one reversible debugger, it is difficult to transfer that achievement to a new programming environment. The user observes a bug when arriving at an error. Searching backwards for the corresponding fault may require many reverse steps. Ultimately, the user prefers to write an expression that will transition to false upon arriving at the fault. The solution is an expression-transition watchpoint facility based on top of snapshots and record/replay. Expression-transition watchpoints are implemented as binary search through the timeline of a program execution, while using the snapshots as landmarks within that timeline. This allows for debugging of subtle bugs that appear only after minutes or more of program execution. When a bug occurs within seconds of program startup, repeated debugging sessions suffice. Reversible debugging is preferred for bugs seen only after minutes. This architecture allows for an efficient and easy-to-write snapshot-based reversible debugger on top of a conventional debugger. 
The validity of this approach was tested by developing four personalities (for GDB, MATLAB, Perl, and Python), with each personality typically requiring just lines of code.","publication_date":{"day":null,"month":null,"year":2017,"errors":{}},"publication_name":"The Art, Science, and Engineering of Programming","grobid_abstract_attachment_id":85247512},"translated_abstract":null,"internal_url":"https://www.academia.edu/78073659/Transition_Watchpoints_Teaching_Old_Debuggers_New_Tricks","translated_internal_url":"","created_at":"2022-04-30T14:18:38.628-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":85247512,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247512/thumbnails/1.jpg","file_name":"1703.pdf","download_url":"https://www.academia.edu/attachments/85247512/download_file","bulk_download_file_name":"Transition_Watchpoints_Teaching_Old_Debu.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247512/1703-libre.pdf?1651354205=\u0026response-content-disposition=attachment%3B+filename%3DTransition_Watchpoints_Teaching_Old_Debu.pdf\u0026Expires=1743681117\u0026Signature=G4n3fIejtEjJigm90O2TlD0lHllfQQrl~z41r8ckokiyRQvqehYiacpKPK9HNXmLXN~DkfeVWyDMFIKpLD34fNCkB4MdQsvOhknnYah0~~Ui--~i8hjla36Xs4YYDX5WWSSQKNzQQ4CFgzFqbW09sPApvWqhHQ-5cHRaJGGH3twJIjO7ZoEekBsGFbhDvZf8Thn7Y~v-nW54VXjeKxz075XEY~ODl19XKhOaFDbl82K~reU4QIfSszkGyUhhVf2pNcgk6xcaLXyiLsA-R1utx2wBwCPwWsmtL1VoJNg1qQYBvLJR4aujGjV9mq4~d7vLecRyKnu9ZN36-vrvJaH2bQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Transition_Watchpoints_Teaching_Old_Debuggers_New_Tricks","translated_slug":"","page_count":28,"language":"en","content_type":"Work","summary":"Reversible debuggers and process replay have been developed at least since. This vision enables one to execute backwards in time under a debugger. 
Two important problems in practice are that, first, current reversible debuggers are slow when reversing over long time periods, and, second, after building one reversible debugger, it is difficult to transfer that achievement to a new programming environment. The user observes a bug when arriving at an error. Searching backwards for the corresponding fault may require many reverse steps. Ultimately, the user prefers to write an expression that will transition to false upon arriving at the fault. The solution is an expression-transition watchpoint facility based on top of snapshots and record/replay. Expression-transition watchpoints are implemented as binary search through the timeline of a program execution, while using the snapshots as landmarks within that timeline. This allows for debugging of subtle bugs that appear only after minutes or more of program execution. When a bug occurs within seconds of program startup, repeated debugging sessions suffice. Reversible debugging is preferred for bugs seen only after minutes. This architecture allows for an efficient and easy-to-write snapshot-based reversible debugger on top of a conventional debugger. 
The validity of this approach was tested by developing four personalities (for GDB, MATLAB, Perl, and Python), with each personality typically requiring just lines of code.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":85247512,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247512/thumbnails/1.jpg","file_name":"1703.pdf","download_url":"https://www.academia.edu/attachments/85247512/download_file","bulk_download_file_name":"Transition_Watchpoints_Teaching_Old_Debu.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247512/1703-libre.pdf?1651354205=\u0026response-content-disposition=attachment%3B+filename%3DTransition_Watchpoints_Teaching_Old_Debu.pdf\u0026Expires=1743681117\u0026Signature=G4n3fIejtEjJigm90O2TlD0lHllfQQrl~z41r8ckokiyRQvqehYiacpKPK9HNXmLXN~DkfeVWyDMFIKpLD34fNCkB4MdQsvOhknnYah0~~Ui--~i8hjla36Xs4YYDX5WWSSQKNzQQ4CFgzFqbW09sPApvWqhHQ-5cHRaJGGH3twJIjO7ZoEekBsGFbhDvZf8Thn7Y~v-nW54VXjeKxz075XEY~ODl19XKhOaFDbl82K~reU4QIfSszkGyUhhVf2pNcgk6xcaLXyiLsA-R1utx2wBwCPwWsmtL1VoJNg1qQYBvLJR4aujGjV9mq4~d7vLecRyKnu9ZN36-vrvJaH2bQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-78073659-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="78073646"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" 
href="https://www.academia.edu/78073646/Towards_Fault_Tolerant_Energy_Efficient_High_Performance_Computing_in_the_Cloud"><img alt="Research paper thumbnail of Towards Fault-Tolerant Energy-Efficient High Performance Computing in the Cloud" class="work-thumbnail" src="https://attachments.academia-assets.com/85247494/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/78073646/Towards_Fault_Tolerant_Energy_Efficient_High_Performance_Computing_in_the_Cloud">Towards Fault-Tolerant Energy-Efficient High Performance Computing in the Cloud</a></div><div class="wp-workCard_item"><span>2012 IEEE International Conference on Cluster Computing</span><span>, 2012</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">In cluster computing, power and cooling represent a significant cost compared to the hardware its...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">In cluster computing, power and cooling represent a significant cost compared to the hardware itself. This is of special concern in the cloud, which provides access to large numbers of computers. We examine the use of ARM-based clusters for low-power, high performance computing. This work examines two likely use-modes: (i) a standard dedicated cluster; and (ii) a cluster of pre-configured virtual machines in the cloud. A 40-node department-level cluster based on an ARM Cortex-A9 is compared against a similar cluster based on an Intel Core 2 Duo, in contrast to a recent similar study on just a 4-node cluster. 
For the NAS benchmarks on 32-node clusters, ARM was found to have a power efficiency ranging from 1.3 to 6.2 times greater than that of Intel. This is despite Intel&#39;s approximately five times greater performance. The particular efficiency ratio depends primarily on the size of the working set relative to L2 cache. In addition to energy-efficient computing, this study also emphasizes fault tolerance: an important ingredient in high performance computing. It relies on two recent extensions to the DMTCP checkpoint-restart package. DMTCP was extended (i) to support ARM CPUs, and (ii) to support checkpointing of the Qemu virtual machine in user-mode. DMTCP is used both to checkpoint native distributed applications, and to checkpoint a network of virtual machines. This latter case demonstrates the ability to deploy pre-configured software in virtual machines hosted in the cloud, and further to migrate cluster computation between hosts in the cloud.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="44075ce150cb8622a8967578e423cc3d" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:85247494,&quot;asset_id&quot;:78073646,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/85247494/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="78073646"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" 
data-work-id="78073646"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 78073646; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=78073646]").text(description); $(".js-view-count[data-work-id=78073646]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 78073646; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='78073646']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "44075ce150cb8622a8967578e423cc3d" } } $('.js-work-strip[data-work-id=78073646]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":78073646,"title":"Towards Fault-Tolerant Energy-Efficient High Performance Computing in the Cloud","translated_title":"","metadata":{"grobid_abstract":"In cluster computing, power and cooling represent a significant cost compared to the 
hardware itself. This is of special concern in the cloud, which provides access to large numbers of computers. We examine the use of ARM-based clusters for low-power, high performance computing. This work examines two likely use-modes: (i) a standard dedicated cluster; and (ii) a cluster of pre-configured virtual machines in the cloud. A 40-node department-level cluster based on an ARM Cortex-A9 is compared against a similar cluster based on an Intel Core 2 Duo, in contrast to a recent similar study on just a 4-node cluster. For the NAS benchmarks on 32-node clusters, ARM was found to have a power efficiency ranging from 1.3 to 6.2 times greater than that of Intel. This is despite Intel's approximately five times greater performance. The particular efficiency ratio depends primarily on the size of the working set relative to L2 cache. In addition to energy-efficient computing, this study also emphasizes fault tolerance: an important ingredient in high performance computing. It relies on two recent extensions to the DMTCP checkpoint-restart package. DMTCP was extended (i) to support ARM CPUs, and (ii) to support checkpointing of the Qemu virtual machine in user-mode. DMTCP is used both to checkpoint native distributed applications, and to checkpoint a network of virtual machines. 
This latter case demonstrates the ability to deploy pre-configured software in virtual machines hosted in the cloud, and further to migrate cluster computation between hosts in the cloud.","publication_date":{"day":null,"month":null,"year":2012,"errors":{}},"publication_name":"2012 IEEE International Conference on Cluster Computing","grobid_abstract_attachment_id":85247494},"translated_abstract":null,"internal_url":"https://www.academia.edu/78073646/Towards_Fault_Tolerant_Energy_Efficient_High_Performance_Computing_in_the_Cloud","translated_internal_url":"","created_at":"2022-04-30T14:17:53.894-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":85247494,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247494/thumbnails/1.jpg","file_name":"cluster12b.pdf","download_url":"https://www.academia.edu/attachments/85247494/download_file","bulk_download_file_name":"Towards_Fault_Tolerant_Energy_Efficient.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247494/cluster12b-libre.pdf?1651354197=\u0026response-content-disposition=attachment%3B+filename%3DTowards_Fault_Tolerant_Energy_Efficient.pdf\u0026Expires=1743681117\u0026Signature=MRrn2b9CP9SXUXzUcnnF7Xj70WdT4cMdVERr6eILsNIH0YF0uelPMS2cK3BWUPucxd8apwfqWXPbBSNJVGinjeg6X6zVO4DRJpcRZ2ugCmokj2-xMTcxzQdolq2suQFOuss-uAotXLYD3IjxZFtWvYuejZRApwgQ4KFsYsEYfEgoL2Fp3RZO0JvugFJKgkir3cgwuBLgWGEZi6nOctHe7X3toUOydJSsP1A8gPogiXAhKH~W8lVzwEGmgbHBzdc00nWbsKVaVfvhYPt4gP-T8kF-fZmDU9ajI6k6DxrRHJKGJ1hQW-BJ5JFCY7aDjA1remlafB0cQIYnc8l~ens9MA__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Towards_Fault_Tolerant_Energy_Efficient_High_Performance_Computing_in_the_Cloud","translated_slug":"","page_count":5,"language":"en","content_type":"Work","summary":"In cluster computing, power and cooling represent a significant cost compared to 
the hardware itself. This is of special concern in the cloud, which provides access to large numbers of computers. We examine the use of ARM-based clusters for low-power, high performance computing. This work examines two likely use-modes: (i) a standard dedicated cluster; and (ii) a cluster of pre-configured virtual machines in the cloud. A 40-node department-level cluster based on an ARM Cortex-A9 is compared against a similar cluster based on an Intel Core 2 Duo, in contrast to a recent similar study on just a 4-node cluster. For the NAS benchmarks on 32-node clusters, ARM was found to have a power efficiency ranging from 1.3 to 6.2 times greater than that of Intel. This is despite Intel's approximately five times greater performance. The particular efficiency ratio depends primarily on the size of the working set relative to L2 cache. In addition to energy-efficient computing, this study also emphasizes fault tolerance: an important ingredient in high performance computing. It relies on two recent extensions to the DMTCP checkpoint-restart package. DMTCP was extended (i) to support ARM CPUs, and (ii) to support checkpointing of the Qemu virtual machine in user-mode. DMTCP is used both to checkpoint native distributed applications, and to checkpoint a network of virtual machines. 
This latter case demonstrates the ability to deploy pre-configured software in virtual machines hosted in the cloud, and further to migrate cluster computation between hosts in the cloud.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":85247494,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247494/thumbnails/1.jpg","file_name":"cluster12b.pdf","download_url":"https://www.academia.edu/attachments/85247494/download_file","bulk_download_file_name":"Towards_Fault_Tolerant_Energy_Efficient.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247494/cluster12b-libre.pdf?1651354197=\u0026response-content-disposition=attachment%3B+filename%3DTowards_Fault_Tolerant_Energy_Efficient.pdf\u0026Expires=1743681117\u0026Signature=MRrn2b9CP9SXUXzUcnnF7Xj70WdT4cMdVERr6eILsNIH0YF0uelPMS2cK3BWUPucxd8apwfqWXPbBSNJVGinjeg6X6zVO4DRJpcRZ2ugCmokj2-xMTcxzQdolq2suQFOuss-uAotXLYD3IjxZFtWvYuejZRApwgQ4KFsYsEYfEgoL2Fp3RZO0JvugFJKgkir3cgwuBLgWGEZi6nOctHe7X3toUOydJSsP1A8gPogiXAhKH~W8lVzwEGmgbHBzdc00nWbsKVaVfvhYPt4gP-T8kF-fZmDU9ajI6k6DxrRHJKGJ1hQW-BJ5JFCY7aDjA1remlafB0cQIYnc8l~ens9MA__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":15835,"name":"Virtual Machines","url":"https://www.academia.edu/Documents/in/Virtual_Machines"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-78073646-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877693"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" 
href="https://www.academia.edu/70877693/FReD_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime"><img alt="Research paper thumbnail of FReD: Automated Debugging via Binary Search through a Process Lifetime" class="work-thumbnail" src="https://attachments.academia-assets.com/80443370/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/70877693/FReD_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime">FReD: Automated Debugging via Binary Search through a Process Lifetime</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Reversible debuggers have been developed at least since 1970. Such a feature is useful when the c...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Reversible debuggers have been developed at least since 1970. Such a feature is useful when the cause of a bug is close in time to the bug manifestation. When the cause is far back in time, one resorts to setting appropriate breakpoints in the debugger and beginning a new debugging session. For these cases when the cause of a bug is far in time from its manifestation, bug diagnosis requires a series of debugging sessions with which to narrow down the cause of the bug. For such &amp;quot;difficult&amp;quot; bugs, this work presents an automated tool to search through the process lifetime and locate the cause. As an example, the bug could be related to a program invariant failing. 
A binary search through the process lifetime suffices, since the invariant expression is true at the beginning of the program execution, and false when the bug is encountered. An algorithm for such a binary search is presented within the FReD (Fast Reversible Debugger) software. It is based on the ability to checkpo...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="2917e625a4e7940aa6bdff8ab186f63f" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443370,&quot;asset_id&quot;:70877693,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443370/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877693"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877693"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877693; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877693]").text(description); $(".js-view-count[data-work-id=70877693]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877693; window.Academia.workPercentilesFetcher.queue(workId, 
function (percentileText) { var container = $(".js-work-strip[data-work-id='70877693']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "2917e625a4e7940aa6bdff8ab186f63f" } } $('.js-work-strip[data-work-id=70877693]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877693,"title":"FReD: Automated Debugging via Binary Search through a Process Lifetime","translated_title":"","metadata":{"abstract":"Reversible debuggers have been developed at least since 1970. Such a feature is useful when the cause of a bug is close in time to the bug manifestation. When the cause is far back in time, one resorts to setting appropriate breakpoints in the debugger and beginning a new debugging session. For these cases when the cause of a bug is far in time from its manifestation, bug diagnosis requires a series of debugging sessions with which to narrow down the cause of the bug. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool to search through the process lifetime and locate the cause. As an example, the bug could be related to a program invariant failing. 
A binary search through the process lifetime suffices, since the invariant expression is true at the beginning of the program execution, and false when the bug is encountered. An algorithm for such a binary search is presented within the FReD (Fast Reversible Debugger) software. It is based on the ability to checkpo...","publication_date":{"day":20,"month":12,"year":2012,"errors":{}}},"translated_abstract":"Reversible debuggers have been developed at least since 1970. Such a feature is useful when the cause of a bug is close in time to the bug manifestation. When the cause is far back in time, one resorts to setting appropriate breakpoints in the debugger and beginning a new debugging session. For these cases when the cause of a bug is far in time from its manifestation, bug diagnosis requires a series of debugging sessions with which to narrow down the cause of the bug. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool to search through the process lifetime and locate the cause. As an example, the bug could be related to a program invariant failing. A binary search through the process lifetime suffices, since the invariant expression is true at the beginning of the program execution, and false when the bug is encountered. An algorithm for such a binary search is presented within the FReD (Fast Reversible Debugger) software. 
It is based on the ability to checkpo...","internal_url":"https://www.academia.edu/70877693/FReD_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime","translated_internal_url":"","created_at":"2022-02-07T15:23:25.825-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443370,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443370/thumbnails/1.jpg","file_name":"1212.5204.pdf","download_url":"https://www.academia.edu/attachments/80443370/download_file","bulk_download_file_name":"FReD_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443370/1212.5204-libre.pdf?1644277584=\u0026response-content-disposition=attachment%3B+filename%3DFReD_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=PT1loahvwy4k8UlEJJnLFmo-z0CzYxK~x7G7WGRqm~3~2H1Ip6dCfQPtYkHCeEcThS1DtNwsQ6rOrDV6IJuizqbXfg-X4qnFlNsfeYtCjkP7KgSmNoDdV~xtsUT3FNvwatU8P89M94rgSU-AizehqwkG0Ke4IfdRiiK4ZkiHhqG8S3HK7HMCZU-HUv4R1OIIvYnuSecI9CyTSeYhfgtjEnaSuClSjLGdWNNflyXQNO3ZET27gxaBK6Ehuq2uIom4nwdgvZ7Whjw8JXHq9taPsJw~DYL-rg7zJop0UUJf0oyUhlfv9VpUezY7XkHDDPqDWEpnFGnaQvcar9AzZ0ulrw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"FReD_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime","translated_slug":"","page_count":21,"language":"en","content_type":"Work","summary":"Reversible debuggers have been developed at least since 1970. Such a feature is useful when the cause of a bug is close in time to the bug manifestation. When the cause is far back in time, one resorts to setting appropriate breakpoints in the debugger and beginning a new debugging session. 
For these cases when the cause of a bug is far in time from its manifestation, bug diagnosis requires a series of debugging sessions with which to narrow down the cause of the bug. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool to search through the process lifetime and locate the cause. As an example, the bug could be related to a program invariant failing. A binary search through the process lifetime suffices, since the invariant expression is true at the beginning of the program execution, and false when the bug is encountered. An algorithm for such a binary search is presented within the FReD (Fast Reversible Debugger) software. It is based on the ability to checkpo...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443370,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443370/thumbnails/1.jpg","file_name":"1212.5204.pdf","download_url":"https://www.academia.edu/attachments/80443370/download_file","bulk_download_file_name":"FReD_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443370/1212.5204-libre.pdf?1644277584=\u0026response-content-disposition=attachment%3B+filename%3DFReD_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=PT1loahvwy4k8UlEJJnLFmo-z0CzYxK~x7G7WGRqm~3~2H1Ip6dCfQPtYkHCeEcThS1DtNwsQ6rOrDV6IJuizqbXfg-X4qnFlNsfeYtCjkP7KgSmNoDdV~xtsUT3FNvwatU8P89M94rgSU-AizehqwkG0Ke4IfdRiiK4ZkiHhqG8S3HK7HMCZU-HUv4R1OIIvYnuSecI9CyTSeYhfgtjEnaSuClSjLGdWNNflyXQNO3ZET27gxaBK6Ehuq2uIom4nwdgvZ7Whjw8JXHq9taPsJw~DYL-rg7zJop0UUJf0oyUhlfv9VpUezY7XkHDDPqDWEpnFGnaQvcar9AzZ0ulrw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":80443369,"title":"","file_type":"pdf","scribd_thumbnail_
url":"https://attachments.academia-assets.com/80443369/thumbnails/1.jpg","file_name":"1212.5204.pdf","download_url":"https://www.academia.edu/attachments/80443369/download_file","bulk_download_file_name":"FReD_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443369/1212.5204-libre.pdf?1644277584=\u0026response-content-disposition=attachment%3B+filename%3DFReD_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=brt~LufuX0HSRcLILgScBnimosnr18ozzG-RAVJy2QQEVvIY05rPCoRA7H06yM~582LKeHdDF-4LN2cklO1~N-GRDYBGlf84J64YZvpM07gY2Lnec-z9FbGZLeyKmxQcKNWVtllT0LM15VmzriVO0faNeBOCk03akIPUJTcj-e2jgrJhCavvq2uFJX~lCRSgIQkCaPHr3ZAG2mjuSmDx8bxBIM0fkdYEOKcV2qtRF4CwtHKp1sd3E1iqQo96gU9xeT9fDXHXc9nMUQWyxBBSD1-7SV7~NactxmlgBk2gYRSmr5Chp0ikqjJelijG6Sqh1jXdsMm4NTqGB3pUHSD0Vg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[],"urls":[{"id":17437732,"url":"https://archive.org/download/arxiv-1212.5204/1212.5204.pdf"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877693-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877692"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877692/DMTCP_Transparent_checkpointing_for_cluster_computations_and_the_desktop"><img alt="Research paper thumbnail of DMTCP: Transparent checkpointing for cluster computations and the desktop" class="work-thumbnail" src="https://attachments.academia-assets.com/80443430/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" 
href="https://www.academia.edu/70877692/DMTCP_Transparent_checkpointing_for_cluster_computations_and_the_desktop">DMTCP: Transparent checkpointing for cluster computations and the desktop</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package for distributed applications. Checkpointing and restart is demonstrated for a wide range of over 20 well known applications, including MATLAB, Python, TightVNC, MPICH2, OpenMPI, and runCMS. RunCMS runs as a 680 MB image in memory that includes 540 dynamic libraries, and is used for the CMS experiment of the Large Hadron Collider at CERN. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads; as well as typical desktop applications. On 128 distributed cores (32 nodes), checkpoint and restart times are typically 2 seconds, with negligible run-time overhead. Typical checkpoint times are reduced to 0.2 seconds when using forked checkpointing. Experimental results show that checkpoint time remains nearly constant as the number of nodes increases on a medium-size cluster. 
DMTCP automatically accounts for fork, exec, ssh, mutexes/semaphor...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="44b1cee6e183759b133d3f811d2c3501" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443430,&quot;asset_id&quot;:70877692,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443430/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877692"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877692"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877692; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877692]").text(description); $(".js-view-count[data-work-id=70877692]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877692; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877692']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); 
container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "44b1cee6e183759b133d3f811d2c3501" } } $('.js-work-strip[data-work-id=70877692]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877692,"title":"DMTCP: Transparent checkpointing for cluster computations and the desktop","translated_title":"","metadata":{"abstract":"DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package for distributed applications. Checkpointing and restart is demonstrated for a wide range of over 20 well known applications, including MATLAB, Python, TightVNC, MPICH2, OpenMPI, and runCMS. RunCMS runs as a 680 MB image in memory that includes 540 dynamic libraries, and is used for the CMS experiment of the Large Hadron Collider at CERN. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads; as well as typical desktop applications. On 128 distributed cores (32 nodes), checkpoint and restart times are typically 2 seconds, with negligible run-time overhead. Typical checkpoint times are reduced to 0.2 seconds when using forked checkpointing. Experimental results show that checkpoint time remains nearly constant as the number of nodes increases on a medium-size cluster. 
DMTCP automatically accounts for fork, exec, ssh, mutexes/semaphor...","publication_date":{"day":null,"month":null,"year":2009,"errors":{}}},"translated_abstract":"DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package for distributed applications. Checkpointing and restart is demonstrated for a wide range of over 20 well known applications, including MATLAB, Python, TightVNC, MPICH2, OpenMPI, and runCMS. RunCMS runs as a 680 MB image in memory that includes 540 dynamic libraries, and is used for the CMS experiment of the Large Hadron Collider at CERN. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads; as well as typical desktop applications. On 128 distributed cores (32 nodes), checkpoint and restart times are typically 2 seconds, with negligible run-time overhead. Typical checkpoint times are reduced to 0.2 seconds when using forked checkpointing. Experimental results show that checkpoint time remains nearly constant as the number of nodes increases on a medium-size cluster. 
DMTCP automatically accounts for fork, exec, ssh, mutexes/semaphor...","internal_url":"https://www.academia.edu/70877692/DMTCP_Transparent_checkpointing_for_cluster_computations_and_the_desktop","translated_internal_url":"","created_at":"2022-02-07T15:23:25.643-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443430,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443430/thumbnails/1.jpg","file_name":"ipdps09.pdf","download_url":"https://www.academia.edu/attachments/80443430/download_file","bulk_download_file_name":"DMTCP_Transparent_checkpointing_for_clus.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443430/ipdps09-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DDMTCP_Transparent_checkpointing_for_clus.pdf\u0026Expires=1743681117\u0026Signature=C03bLppfm9j52RgJ0XnNMSE7sPqLQxUq-GX4FucMgTj98WpnZjoasEHTflsMtKJJ3sB01VJ7~GiYf05ZH782I5FxWh6Y8VSyLZaiF2X4pstB4QkS9rfTOwvSilJffF-pWCUDZuuShqh6QepiJy9BB1GmE75od8UgAn1lQtoCP3KBjRyJ-jMohmxITRKxbDebeem2dl~jwntKfNxRXRBo0gy37egubT30FTycpSO6LwRweLf2l1qD91RyJDoKaveIvLqBoKGVImBteJ1kTLuAh0gxBWQwFxkynmKRZZOUZPkX8OdadkpfwF9OiU7CPYxzL9uUB~CTxJv1co5E~vQK5A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"DMTCP_Transparent_checkpointing_for_cluster_computations_and_the_desktop","translated_slug":"","page_count":12,"language":"en","content_type":"Work","summary":"DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package for distributed applications. Checkpointing and restart is demonstrated for a wide range of over 20 well known applications, including MATLAB, Python, TightVNC, MPICH2, OpenMPI, and runCMS. 
RunCMS runs as a 680 MB image in memory that includes 540 dynamic libraries, and is used for the CMS experiment of the Large Hadron Collider at CERN. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads; as well as typical desktop applications. On 128 distributed cores (32 nodes), checkpoint and restart times are typically 2 seconds, with negligible run-time overhead. Typical checkpoint times are reduced to 0.2 seconds when using forked checkpointing. Experimental results show that checkpoint time remains nearly constant as the number of nodes increases on a medium-size cluster. DMTCP automatically accounts for fork, exec, ssh, mutexes/semaphor...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443430,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443430/thumbnails/1.jpg","file_name":"ipdps09.pdf","download_url":"https://www.academia.edu/attachments/80443430/download_file","bulk_download_file_name":"DMTCP_Transparent_checkpointing_for_clus.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443430/ipdps09-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DDMTCP_Transparent_checkpointing_for_clus.pdf\u0026Expires=1743681117\u0026Signature=C03bLppfm9j52RgJ0XnNMSE7sPqLQxUq-GX4FucMgTj98WpnZjoasEHTflsMtKJJ3sB01VJ7~GiYf05ZH782I5FxWh6Y8VSyLZaiF2X4pstB4QkS9rfTOwvSilJffF-pWCUDZuuShqh6QepiJy9BB1GmE75od8UgAn1lQtoCP3KBjRyJ-jMohmxITRKxbDebeem2dl~jwntKfNxRXRBo0gy37egubT30FTycpSO6LwRweLf2l1qD91RyJDoKaveIvLqBoKGVImBteJ1kTLuAh0gxBWQwFxkynmKRZZOUZPkX8OdadkpfwF9OiU7CPYxzL9uUB~CTxJv1co5E~vQK5A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer 
Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":5021,"name":"Packaging","url":"https://www.academia.edu/Documents/in/Packaging"},{"id":34740,"name":"Cluster Computing","url":"https://www.academia.edu/Documents/in/Cluster_Computing"},{"id":44244,"name":"OPERATING SYSTEM","url":"https://www.academia.edu/Documents/in/OPERATING_SYSTEM"},{"id":67584,"name":"Large Hadron Collider","url":"https://www.academia.edu/Documents/in/Large_Hadron_Collider"},{"id":80870,"name":"Parallel \u0026 Distributed Computing","url":"https://www.academia.edu/Documents/in/Parallel_and_Distributed_Computing"},{"id":97733,"name":"Shared memory","url":"https://www.academia.edu/Documents/in/Shared_memory"},{"id":191487,"name":"Kernel","url":"https://www.academia.edu/Documents/in/Kernel"}],"urls":[{"id":17437731,"url":"http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.207.7693\u0026rep=rep1\u0026type=pdf"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877692-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877691"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877691/Semi_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime"><img alt="Research paper thumbnail of Semi-Automated Debugging via Binary Search through a Process Lifetime" class="work-thumbnail" src="https://attachments.academia-assets.com/80443431/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" 
href="https://www.academia.edu/70877691/Semi_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime">Semi-Automated Debugging via Binary Search through a Process Lifetime</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">A common programmer experience is to execute a long-running computation only to see a bug crash t...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">A common programmer experience is to execute a long-running computation only to see a bug crash the program after hours or days. While it is often easy to capture a &amp;quot;buggy&amp;quot; expression value at the point of the crash, it is less easy to discover the point in the program where the expression became buggy. For such &amp;quot;difficult&amp;quot; bugs, this work presents an automated tool based on binary search through a process lifetime. The tool operates both in singlethreaded and multi-threaded program. The underlying algorithm depends on on checkpoints, deterministic replay, and decomposition of debugging histories. The tool is scalable in the sense that the running time is a small constant factor beyond the standalone running time. Further, it requires only a logarithmic number of probes of the expression value -an advantage when the time to execute the expression is large. 
The algorithm is demonstrated for such realworld programs as MySQL.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="834a2dcec8422e0d028df02587fad5ea" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443431,&quot;asset_id&quot;:70877691,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443431/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877691"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877691"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877691; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877691]").text(description); $(".js-view-count[data-work-id=70877691]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877691; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877691']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); 
container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "834a2dcec8422e0d028df02587fad5ea" } } $('.js-work-strip[data-work-id=70877691]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877691,"title":"Semi-Automated Debugging via Binary Search through a Process Lifetime","translated_title":"","metadata":{"abstract":"A common programmer experience is to execute a long-running computation only to see a bug crash the program after hours or days. While it is often easy to capture a \u0026quot;buggy\u0026quot; expression value at the point of the crash, it is less easy to discover the point in the program where the expression became buggy. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool based on binary search through a process lifetime. The tool operates both in singlethreaded and multi-threaded program. The underlying algorithm depends on on checkpoints, deterministic replay, and decomposition of debugging histories. The tool is scalable in the sense that the running time is a small constant factor beyond the standalone running time. Further, it requires only a logarithmic number of probes of the expression value -an advantage when the time to execute the expression is large. 
The algorithm is demonstrated for such realworld programs as MySQL.","publication_date":{"day":null,"month":null,"year":2020,"errors":{}}},"translated_abstract":"A common programmer experience is to execute a long-running computation only to see a bug crash the program after hours or days. While it is often easy to capture a \u0026quot;buggy\u0026quot; expression value at the point of the crash, it is less easy to discover the point in the program where the expression became buggy. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool based on binary search through a process lifetime. The tool operates both in singlethreaded and multi-threaded program. The underlying algorithm depends on on checkpoints, deterministic replay, and decomposition of debugging histories. The tool is scalable in the sense that the running time is a small constant factor beyond the standalone running time. Further, it requires only a logarithmic number of probes of the expression value -an advantage when the time to execute the expression is large. 
The algorithm is demonstrated for such realworld programs as MySQL.","internal_url":"https://www.academia.edu/70877691/Semi_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime","translated_internal_url":"","created_at":"2022-02-07T15:23:25.464-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443431,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443431/thumbnails/1.jpg","file_name":"1212.pdf","download_url":"https://www.academia.edu/attachments/80443431/download_file","bulk_download_file_name":"Semi_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443431/1212-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DSemi_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=bUhb7yoiQiMSxeUhfadqiss2eYraiGP3RUyIUIm3I~ao7qbwe-yrJkmJ7Jh7Jcg9vphiekYgB85d9NFQ7p0sVpvVzoK1y7WgKAxetM4AMx2BcsmZY8L6vAYdJ~jQchOe33-XOpKQ~Iywb~3vNCtjGUjvpRthmAidC82Vi-PLiEJCbJOVeAp5J5sLW~3-BBL2ThgmvS6XzMk363GbSzSwBk6HMigRQxXosU6eeu26RFKhTa3tofYwI5KB7QKhIcjkuxzu1EDdfcEC55O-5K0rAESZnI1mA8BJzawURn6gRM3bDQZbwBgjPx1fvwhm0YyGsIGo4a6oZV0aqi4ccPAC3Q__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Semi_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime","translated_slug":"","page_count":21,"language":"en","content_type":"Work","summary":"A common programmer experience is to execute a long-running computation only to see a bug crash the program after hours or days. While it is often easy to capture a \u0026quot;buggy\u0026quot; expression value at the point of the crash, it is less easy to discover the point in the program where the expression became buggy. 
For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool based on binary search through a process lifetime. The tool operates both in singlethreaded and multi-threaded program. The underlying algorithm depends on on checkpoints, deterministic replay, and decomposition of debugging histories. The tool is scalable in the sense that the running time is a small constant factor beyond the standalone running time. Further, it requires only a logarithmic number of probes of the expression value -an advantage when the time to execute the expression is large. The algorithm is demonstrated for such realworld programs as MySQL.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443431,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443431/thumbnails/1.jpg","file_name":"1212.pdf","download_url":"https://www.academia.edu/attachments/80443431/download_file","bulk_download_file_name":"Semi_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443431/1212-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DSemi_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=bUhb7yoiQiMSxeUhfadqiss2eYraiGP3RUyIUIm3I~ao7qbwe-yrJkmJ7Jh7Jcg9vphiekYgB85d9NFQ7p0sVpvVzoK1y7WgKAxetM4AMx2BcsmZY8L6vAYdJ~jQchOe33-XOpKQ~Iywb~3vNCtjGUjvpRthmAidC82Vi-PLiEJCbJOVeAp5J5sLW~3-BBL2ThgmvS6XzMk363GbSzSwBk6HMigRQxXosU6eeu26RFKhTa3tofYwI5KB7QKhIcjkuxzu1EDdfcEC55O-5K0rAESZnI1mA8BJzawURn6gRM3bDQZbwBgjPx1fvwhm0YyGsIGo4a6oZV0aqi4ccPAC3Q__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer 
Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":8137,"name":"Unix","url":"https://www.academia.edu/Documents/in/Unix"},{"id":43254,"name":"Composition","url":"https://www.academia.edu/Documents/in/Composition"},{"id":99915,"name":"Integration","url":"https://www.academia.edu/Documents/in/Integration"},{"id":2722261,"name":"Metasystem","url":"https://www.academia.edu/Documents/in/Metasystem"},{"id":3193313,"name":"arXiv","url":"https://www.academia.edu/Documents/in/arXiv"}],"urls":[{"id":17437730,"url":"http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1062.8803\u0026rep=rep1\u0026type=pdf"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877691-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877690"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877690/Design_and_Implementation_for_Checkpointing_of_Distributed_Resources_Using_Process_Level_Virtualization"><img alt="Research paper thumbnail of Design and Implementation for Checkpointing of Distributed Resources Using Process-Level Virtualization" class="work-thumbnail" src="https://attachments.academia-assets.com/80443428/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/70877690/Design_and_Implementation_for_Checkpointing_of_Distributed_Resources_Using_Process_Level_Virtualization">Design and Implementation for Checkpointing of Distributed Resources Using Process-Level Virtualization</a></div><div class="wp-workCard_item"><span 
class="js-work-more-abstract-truncated">System-level checkpoint-restart is a critical technology for long-running jobs in high-performanc...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">System-level checkpoint-restart is a critical technology for long-running jobs in high-performance computing. Yet, only two approaches to checkpointing MPI applications continue to survive in wide use today. One approach is to use the kernel module-based BLCR in combination with an MPI checkpoint-restart service particular to the MPI implementation in use. Unfortunately, this lacks support for some important Linux system services such as SysV IPC (e.g., shared memory objects). A second approach has been to use the original 2009 DMTCP implementation (herein referred to as DMTCP-09) for transparent, system-level checkpointing. Unfortunately, DMTCP-09 lacked support for checkpointing many of the necessary features found by MPI in a modern batch environment. 
These include: ssh, the InfiniBand network, process migration (restarting an MPI application on different cluster nodes), and modified file path prefixes on restart (typically due to a changing current directory, mount points, libra...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="c9810bcec5364be316e8e9cd33603a2a" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443428,&quot;asset_id&quot;:70877690,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443428/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877690"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877690"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877690; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877690]").text(description); $(".js-view-count[data-work-id=70877690]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877690; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = 
$(".js-work-strip[data-work-id='70877690']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "c9810bcec5364be316e8e9cd33603a2a" } } $('.js-work-strip[data-work-id=70877690]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877690,"title":"Design and Implementation for Checkpointing of Distributed Resources Using Process-Level Virtualization","translated_title":"","metadata":{"abstract":"System-level checkpoint-restart is a critical technology for long-running jobs in high-performance computing. Yet, only two approaches to checkpointing MPI applications continue to survive in wide use today. One approach is to use the kernel module-based BLCR in combination with an MPI checkpoint-restart service particular to the MPI implementation in use. Unfortunately, this lacks support for some important Linux system services such as SysV IPC (e.g., shared memory objects). A second approach has been to use the original 2009 DMTCP implementation (herein referred to as DMTCP-09) for transparent, system-level checkpointing. 
Unfortunately, DMTCP-09 lacked support for checkpointing many of the necessary features found by MPI in a modern batch environment. These include: ssh, the InfiniBand network, process migration (restarting an MPI application on different cluster nodes), and modified file path prefixes on restart (typically due to a changing current directory, mount points, libra...","publisher":"2016 IEEE International Conference on Cluster Computing (CLUSTER)","publication_date":{"day":null,"month":null,"year":2016,"errors":{}}},"translated_abstract":"System-level checkpoint-restart is a critical technology for long-running jobs in high-performance computing. Yet, only two approaches to checkpointing MPI applications continue to survive in wide use today. One approach is to use the kernel module-based BLCR in combination with an MPI checkpoint-restart service particular to the MPI implementation in use. Unfortunately, this lacks support for some important Linux system services such as SysV IPC (e.g., shared memory objects). A second approach has been to use the original 2009 DMTCP implementation (herein referred to as DMTCP-09) for transparent, system-level checkpointing. Unfortunately, DMTCP-09 lacked support for checkpointing many of the necessary features found by MPI in a modern batch environment. 
These include: ssh, the InfiniBand network, process migration (restarting an MPI application on different cluster nodes), and modified file path prefixes on restart (typically due to a changing current directory, mount points, libra...","internal_url":"https://www.academia.edu/70877690/Design_and_Implementation_for_Checkpointing_of_Distributed_Resources_Using_Process_Level_Virtualization","translated_internal_url":"","created_at":"2022-02-07T15:23:25.227-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443428,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443428/thumbnails/1.jpg","file_name":"cluster16.pdf","download_url":"https://www.academia.edu/attachments/80443428/download_file","bulk_download_file_name":"Design_and_Implementation_for_Checkpoint.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443428/cluster16-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DDesign_and_Implementation_for_Checkpoint.pdf\u0026Expires=1743681117\u0026Signature=FIArRc0yQAb-ahw50oM7gX-5z0X2Jy4bXL6D-qUq06G4WU8NmSdm-1Us19d2vWvpC5MPvwiYOJT2LJSGFAmKIZfNRnO-Wpg804h1DY0qK-2d~sjI8Gvl2yK0Kemm4USxwjyU~IHevzvgZpcp-3GN7c3M9pZiR-gIqLqo~R3A1GmjN0OIl1-uLYAY4g~x667QsM8sla-460lWaBXhU8EmXh5w91KebeRvSp322-g~XTxj7vY1hQLA7eqoSbUXrMAuwiE0hs98bWACSCPWbI4yvJKGKnDmWREo8kmo5VkInuT2YaR21D7MZg1c58~Z6YKR5XJUcT~vMF8F7DRsGMdAEQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Design_and_Implementation_for_Checkpointing_of_Distributed_Resources_Using_Process_Level_Virtualization","translated_slug":"","page_count":11,"language":"en","content_type":"Work","summary":"System-level checkpoint-restart is a critical technology for long-running jobs in high-performance computing. 
Yet, only two approaches to checkpointing MPI applications continue to survive in wide use today. One approach is to use the kernel module-based BLCR in combination with an MPI checkpoint-restart service particular to the MPI implementation in use. Unfortunately, this lacks support for some important Linux system services such as SysV IPC (e.g., shared memory objects). A second approach has been to use the original 2009 DMTCP implementation (herein referred to as DMTCP-09) for transparent, system-level checkpointing. Unfortunately, DMTCP-09 lacked support for checkpointing many of the necessary features found by MPI in a modern batch environment. These include: ssh, the InfiniBand network, process migration (restarting an MPI application on different cluster nodes), and modified file path prefixes on restart (typically due to a changing current directory, mount points, libra...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443428,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443428/thumbnails/1.jpg","file_name":"cluster16.pdf","download_url":"https://www.academia.edu/attachments/80443428/download_file","bulk_download_file_name":"Design_and_Implementation_for_Checkpoint.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443428/cluster16-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DDesign_and_Implementation_for_Checkpoint.pdf\u0026Expires=1743681117\u0026Signature=FIArRc0yQAb-ahw50oM7gX-5z0X2Jy4bXL6D-qUq06G4WU8NmSdm-1Us19d2vWvpC5MPvwiYOJT2LJSGFAmKIZfNRnO-Wpg804h1DY0qK-2d~sjI8Gvl2yK0Kemm4USxwjyU~IHevzvgZpcp-3GN7c3M9pZiR-gIqLqo~R3A1GmjN0OIl1-uLYAY4g~x667QsM8sla-460lWaBXhU8EmXh5w91KebeRvSp322-g~XTxj7vY1hQLA7eqoSbUXrMAuwiE0hs98bWACSCPWbI4yvJKGKnDmWREo8kmo5VkInuT2YaR21D7MZg1c58~Z6YKR5XJUcT~vMF8F7DRsGMdAEQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[],"urls":[{"id":17437729,"url":"http://doi.ieeecomputersociety.org/10.1109/CLUSTER.2016.55"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877690-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877689"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877689/User_space_process_virtualization_in_the_context_of_checkpoint_restart_and_virtual_machines"><img alt="Research paper thumbnail of User-space process virtualization in the context of checkpoint-restart and virtual machines" class="work-thumbnail" src="https://attachments.academia-assets.com/80443429/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div 
class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/70877689/User_space_process_virtualization_in_the_context_of_checkpoint_restart_and_virtual_machines">User-space process virtualization in the context of checkpoint-restart and virtual machines</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Checkpoint-Restart is the ability to save a set of running processes to a checkpoint image on dis...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Checkpoint-Restart is the ability to save a set of running processes to a checkpoint image on disk, and to later restart them from the disk. In addition to its traditional use in fault tolerance, recovering from a system failure, it has numerous other uses, such as for application debugging and save/restore of the workspace of an interactive problem-solving environment. Transparent checkpointing operates without modifying the underlying application program, but it implicitly relies on a &quot;Closed World Assumption&quot;-the world (including file system, network, etc.) will look the same upon restart as it did at the time of checkpoint. This is not valid for more complex programs. Until now, checkpoint-restart packages have adopted ad hoc solutions for each case where the environment changes upon restart. This dissertation presents user-space process virtualization to decouple application processes from the external subsystems. A thin virtualization layer is introduced between the application and each external subsystem. It provides the application with a consistent view of the external world and allows for checkpoint-restart to succeed. 
The ever growing number of external subsystems make it harder to deploy and maintain virtualization layers in a monolithic checkpoint-restart system. To address this, an adaptive plugin based approach is used to implement the virtualization layers that allow the checkpoint-restart system to grow organically. The principle of decoupling the external subsystem through process virtualization is also applied in the context of virtual machines for providing a solution to the long standing double-paging problem. Double-paging occurs when the guest attempts to page out memory that has previously been swapped out by the hypervisor and leads to long delays for the guest as the contents are read back into machine memory only to be written out again. The performance rapidly drops as a result of significant lengthening of the time to complete the guest I/O request.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="655cc617adb2884f1740f0faa11c4c32" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443429,&quot;asset_id&quot;:70877689,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443429/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877689"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877689"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var 
workId = 70877689; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877689]").text(description); $(".js-view-count[data-work-id=70877689]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877689; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877689']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "655cc617adb2884f1740f0faa11c4c32" } } $('.js-work-strip[data-work-id=70877689]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877689,"title":"User-space process virtualization in the context of checkpoint-restart and virtual machines","translated_title":"","metadata":{"grobid_abstract":"Checkpoint-Restart is the ability to save a set of running processes to a checkpoint image on disk, and to later restart them from the disk. 
In addition to its traditional use in fault tolerance, recovering from a system failure, it has numerous other uses, such as for application debugging and save/restore of the workspace of an interactive problem-solving environment. Transparent checkpointing operates without modifying the underlying application program, but it implicitly relies on a \"Closed World Assumption\"-the world (including file system, network, etc.) will look the same upon restart as it did at the time of checkpoint. This is not valid for more complex programs. Until now, checkpoint-restart packages have adopted ad hoc solutions for each case where the environment changes upon restart. This dissertation presents user-space process virtualization to decouple application processes from the external subsystems. A thin virtualization layer is introduced between the application and each external subsystem. It provides the application with a consistent view of the external world and allows for checkpoint-restart to succeed. The ever growing number of external subsystems make it harder to deploy and maintain virtualization layers in a monolithic checkpoint-restart system. To address this, an adaptive plugin based approach is used to implement the virtualization layers that allow the checkpoint-restart system to grow organically. The principle of decoupling the external subsystem through process virtualization is also applied in the context of virtual machines for providing a solution to the long standing double-paging problem. Double-paging occurs when the guest attempts to page out memory that has previously been swapped out by the hypervisor and leads to long delays for the guest as the contents are read back into machine memory only to be written out again. 
The performance rapidly drops as a result of significant lengthening of the time to complete the guest I/O request.","publication_date":{"day":null,"month":null,"year":2014,"errors":{}},"grobid_abstract_attachment_id":80443429},"translated_abstract":null,"internal_url":"https://www.academia.edu/70877689/User_space_process_virtualization_in_the_context_of_checkpoint_restart_and_virtual_machines","translated_internal_url":"","created_at":"2022-02-07T15:23:24.846-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443429,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443429/thumbnails/1.jpg","file_name":"fulltext.pdf","download_url":"https://www.academia.edu/attachments/80443429/download_file","bulk_download_file_name":"User_space_process_virtualization_in_the.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443429/fulltext-libre.pdf?1644277580=\u0026response-content-disposition=attachment%3B+filename%3DUser_space_process_virtualization_in_the.pdf\u0026Expires=1743681117\u0026Signature=RC5uScnK16zpQqQnJ0yMQNXgSMhL85cqnKy4W3~iMOT8Z5d0DY~dmyYys4LqoigEfliVOmeRcPrgVZ9HuuoPUNktQRKBIHZIpc9m87B179xxKz-nzN6ltXNzEPeqqVaN44IDwal10ZU0eT1~f7jcT7z6gqU2nG9GyM0CauFkwcBwEXCOWJ5xOzBw2dSKvPUuyp3pCdWymdL9dKvfjAPOpGsgmDjUP9DsXgGOItBvF2F~OFg3rKVh~z3SBHBYtuDIi8tQLOWiXw2S3Oc8-~OPOBBE5qheVoz2BAkgYtlLyXGjYftVM1qMFY-BBUPUuUVjLsPZ-xJmgoJGidc2wvYDug__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"User_space_process_virtualization_in_the_context_of_checkpoint_restart_and_virtual_machines","translated_slug":"","page_count":200,"language":"en","content_type":"Work","summary":"Checkpoint-Restart is the ability to save a set of running processes to a checkpoint image on disk, and to later restart them from the disk. 
In addition to its traditional use in fault tolerance, recovering from a system failure, it has numerous other uses, such as for application debugging and save/restore of the workspace of an interactive problem-solving environment. Transparent checkpointing operates without modifying the underlying application program, but it implicitly relies on a \"Closed World Assumption\"-the world (including file system, network, etc.) will look the same upon restart as it did at the time of checkpoint. This is not valid for more complex programs. Until now, checkpoint-restart packages have adopted ad hoc solutions for each case where the environment changes upon restart. This dissertation presents user-space process virtualization to decouple application processes from the external subsystems. A thin virtualization layer is introduced between the application and each external subsystem. It provides the application with a consistent view of the external world and allows for checkpoint-restart to succeed. The ever growing number of external subsystems make it harder to deploy and maintain virtualization layers in a monolithic checkpoint-restart system. To address this, an adaptive plugin based approach is used to implement the virtualization layers that allow the checkpoint-restart system to grow organically. The principle of decoupling the external subsystem through process virtualization is also applied in the context of virtual machines for providing a solution to the long standing double-paging problem. Double-paging occurs when the guest attempts to page out memory that has previously been swapped out by the hypervisor and leads to long delays for the guest as the contents are read back into machine memory only to be written out again. 
The performance rapidly drops as a result of significant lengthening of the time to complete the guest I/O request.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443429,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443429/thumbnails/1.jpg","file_name":"fulltext.pdf","download_url":"https://www.academia.edu/attachments/80443429/download_file","bulk_download_file_name":"User_space_process_virtualization_in_the.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443429/fulltext-libre.pdf?1644277580=\u0026response-content-disposition=attachment%3B+filename%3DUser_space_process_virtualization_in_the.pdf\u0026Expires=1743681117\u0026Signature=RC5uScnK16zpQqQnJ0yMQNXgSMhL85cqnKy4W3~iMOT8Z5d0DY~dmyYys4LqoigEfliVOmeRcPrgVZ9HuuoPUNktQRKBIHZIpc9m87B179xxKz-nzN6ltXNzEPeqqVaN44IDwal10ZU0eT1~f7jcT7z6gqU2nG9GyM0CauFkwcBwEXCOWJ5xOzBw2dSKvPUuyp3pCdWymdL9dKvfjAPOpGsgmDjUP9DsXgGOItBvF2F~OFg3rKVh~z3SBHBYtuDIi8tQLOWiXw2S3Oc8-~OPOBBE5qheVoz2BAkgYtlLyXGjYftVM1qMFY-BBUPUuUVjLsPZ-xJmgoJGidc2wvYDug__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":440,"name":"Distributed Computing","url":"https://www.academia.edu/Documents/in/Distributed_Computing"},{"id":15835,"name":"Virtual Machines","url":"https://www.academia.edu/Documents/in/Virtual_Machines"},{"id":36300,"name":"Fault Tolerance","url":"https://www.academia.edu/Documents/in/Fault_Tolerance"},{"id":327659,"name":"Paging","url":"https://www.academia.edu/Documents/in/Paging"}],"urls":[{"id":17437728,"url":"http://iris.lib.neu.edu/cgi/viewcontent.cgi?article=1042\u0026context=comp_sci_diss"}]}, dispatcherData: 
dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877689-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877688"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" rel="nofollow" href="https://www.academia.edu/70877688/Detecting_and_Suppressing_Redundant_Input_Output_Operations"><img alt="Research paper thumbnail of Detecting and Suppressing Redundant Input-Output Operations" class="work-thumbnail" src="https://a.academia-assets.com/images/blank-paper.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title">Detecting and Suppressing Redundant Input-Output Operations</div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877688"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877688"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877688; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877688]").text(description); $(".js-view-count[data-work-id=70877688]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x 
work-percentile"></span></span><script>$(function () { var workId = 70877688; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877688']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (false){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "-1" } } $('.js-work-strip[data-work-id=70877688]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877688,"title":"Detecting and Suppressing Redundant Input-Output 
Operations","translated_title":"","metadata":{"publication_date":{"day":20,"month":12,"year":2012,"errors":{}}},"translated_abstract":null,"internal_url":"https://www.academia.edu/70877688/Detecting_and_Suppressing_Redundant_Input_Output_Operations","translated_internal_url":"","created_at":"2022-02-07T15:23:24.713-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[],"slug":"Detecting_and_Suppressing_Redundant_Input_Output_Operations","translated_slug":"","page_count":null,"language":"en","content_type":"Work","summary":null,"owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[],"research_interests":[],"urls":[{"id":17437727,"url":"http://www.freepatentsonline.com/y2012/0324181.html"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877688-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877687"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" rel="nofollow" href="https://www.academia.edu/70877687/DMTCP_bringing_interactive_checkpoint_restart_to_Python"><img alt="Research paper thumbnail of DMTCP: bringing interactive checkpoint–restart to Python" class="work-thumbnail" src="https://a.academia-assets.com/images/blank-paper.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title">DMTCP: bringing interactive checkpoint–restart to Python</div><div 
class="wp-workCard_item"><span>Computational Science &amp; Discovery</span><span>, 2015</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operat...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operates in user space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython &amp;#39;from the outside&amp;#39; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be requested interactively within a Python session or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process) and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using virtual network client, (ii) a fast/slow technique to use multiple hosts or cores to check one (Cython Behnel S et al 2011 Comput. Sci. Eng. 
13 31–39) computation in parallel, and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint feature for locating the cause of a bug.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877687"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877687"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877687; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877687]").text(description); $(".js-view-count[data-work-id=70877687]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877687; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877687']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 
})(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (false){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "-1" } } $('.js-work-strip[data-work-id=70877687]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877687,"title":"DMTCP: bringing interactive checkpoint–restart to Python","translated_title":"","metadata":{"abstract":"DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operates in user space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026#39;from the outside\u0026#39; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be requested interactively within a Python session or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process) and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using virtual network client, (ii) a fast/slow technique to use multiple hosts or cores to check one (Cython Behnel S et al 2011 Comput. Sci. Eng. 
13 31–39) computation in parallel, and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint feature for locating the cause of a bug.","publisher":"IOP Publishing","publication_date":{"day":null,"month":null,"year":2015,"errors":{}},"publication_name":"Computational Science \u0026 Discovery"},"translated_abstract":"DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operates in user space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026#39;from the outside\u0026#39; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be requested interactively within a Python session or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process) and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using virtual network client, (ii) a fast/slow technique to use multiple hosts or cores to check one (Cython Behnel S et al 2011 Comput. Sci. Eng. 
13 31–39) computation in parallel, and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint feature for locating the cause of a bug.","internal_url":"https://www.academia.edu/70877687/DMTCP_bringing_interactive_checkpoint_restart_to_Python","translated_internal_url":"","created_at":"2022-02-07T15:23:24.614-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[],"slug":"DMTCP_bringing_interactive_checkpoint_restart_to_Python","translated_slug":"","page_count":null,"language":"en","content_type":"Work","summary":"DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operates in user space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026#39;from the outside\u0026#39; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be requested interactively within a Python session or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process) and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using virtual network client, (ii) a fast/slow technique to use multiple hosts or cores to check one (Cython Behnel S et al 2011 Comput. Sci. Eng. 
13 31–39) computation in parallel, and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint feature for locating the cause of a bug.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877687-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877571"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877571/SmartHarvest"><img alt="Research paper thumbnail of SmartHarvest" class="work-thumbnail" src="https://attachments.academia-assets.com/80443367/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/70877571/SmartHarvest">SmartHarvest</a></div><div class="wp-workCard_item"><span>Proceedings of the Sixteenth European Conference on Computer Systems</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">We can increase the efficiency of public cloud datacenters by harvesting allocated but temporaril...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa 
fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">We can increase the efficiency of public cloud datacenters by harvesting allocated but temporarily idling CPU cores from customer virtual machines (VMs) to run batch or analytics workloads. Even small efficiency gains translate into substantial savings, since provisioning and operating a datacenter costs hundreds of millions of dollars per year. The main challenge is to harvest idle cores with little or no impact on customer VMs, which could be running latency-sensitive services and are essentially black-boxes to the cloud provider. We introduce ElasticVM, a new VM type that can run batch workloads cheaply using mainly harvested cores. We also propose SmartHarvest, a system that dynamically manages the number of cores available to ElasticVMs in each fine-grained time window. SmartHarvest uses online learning to predict the core demand of primary, customer VMs and compute the number of cores that can be safely harvested. Our results show that SmartHarvest can harvest a significant amount of CPU resources without increasing the 99th-percentile tail latency of latency-critical primary workloads by more than 10%. 
Unlike static harvesting techniques</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="3bab90b5ec720b53f4ff1b52bc8f4126" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443367,&quot;asset_id&quot;:70877571,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443367/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877571"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877571"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877571; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877571]").text(description); $(".js-view-count[data-work-id=70877571]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877571; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877571']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); 
container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "3bab90b5ec720b53f4ff1b52bc8f4126" } } $('.js-work-strip[data-work-id=70877571]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877571,"title":"SmartHarvest","translated_title":"","metadata":{"publisher":"ACM","ai_title_tag":"Smart Harvesting of Idle Cloud CPU Cores","grobid_abstract":"We can increase the efficiency of public cloud datacenters by harvesting allocated but temporarily idling CPU cores from customer virtual machines (VMs) to run batch or analytics workloads. Even small efficiency gains translate into substantial savings, since provisioning and operating a datacenter costs hundreds of millions of dollars per year. The main challenge is to harvest idle cores with little or no impact on customer VMs, which could be running latency-sensitive services and are essentially black-boxes to the cloud provider. We introduce ElasticVM, a new VM type that can run batch workloads cheaply using mainly harvested cores. We also propose SmartHarvest, a system that dynamically manages the number of cores available to ElasticVMs in each fine-grained time window. SmartHarvest uses online learning to predict the core demand of primary, customer VMs and compute the number of cores that can be safely harvested. 
Our results show that SmartHarvest can harvest a significant amount of CPU resources without increasing the 99th-percentile tail latency of latency-critical primary workloads by more than 10%. Unlike static harvesting techniques","publication_name":"Proceedings of the Sixteenth European Conference on Computer Systems","grobid_abstract_attachment_id":80443367},"translated_abstract":null,"internal_url":"https://www.academia.edu/70877571/SmartHarvest","translated_internal_url":"","created_at":"2022-02-07T15:22:22.514-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443367,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443367/thumbnails/1.jpg","file_name":"eurosys21.pdf","download_url":"https://www.academia.edu/attachments/80443367/download_file","bulk_download_file_name":"SmartHarvest.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443367/eurosys21-libre.pdf?1644277588=\u0026response-content-disposition=attachment%3B+filename%3DSmartHarvest.pdf\u0026Expires=1743681117\u0026Signature=BWTqa-8mVQM77Ckw3AQk2lnrfBeOjOPbF-tnWf~s7adzI5S3sAxaPrSRRJs9TrNZhb8r9~GxFIE60dIJ1A0~8R93C~vLYrNzRc03ULDPfCOmbOy~FavHVZsI6ESyUAZgL~35ii-1A8NSsmufDcH1Jwn0F4zYGtRESzHkEO5liSv1oUemnQLfAKOZmumvUX8Qd38zHb5bL1swJx2PxBMwjmdWW-ZOg2uqL~rM4z~Vs0s7G6W53~IB7rpauiaj0fVvD31Oxh82wUyDce4oEMbLK9FslDT0sVhEV-bhnzgJc5hEVGp9QctSqWRKVkToaIKtS83AWjyHoErMCrgLHNoL8w__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"SmartHarvest","translated_slug":"","page_count":16,"language":"en","content_type":"Work","summary":"We can increase the efficiency of public cloud datacenters by harvesting allocated but temporarily idling CPU cores from customer virtual machines (VMs) to run batch or analytics workloads. 
Even small efficiency gains translate into substantial savings, since provisioning and operating a datacenter costs hundreds of millions of dollars per year. The main challenge is to harvest idle cores with little or no impact on customer VMs, which could be running latency-sensitive services and are essentially black-boxes to the cloud provider. We introduce ElasticVM, a new VM type that can run batch workloads cheaply using mainly harvested cores. We also propose SmartHarvest, a system that dynamically manages the number of cores available to ElasticVMs in each fine-grained time window. SmartHarvest uses online learning to predict the core demand of primary, customer VMs and compute the number of cores that can be safely harvested. Our results show that SmartHarvest can harvest a significant amount of CPU resources without increasing the 99th-percentile tail latency of latency-critical primary workloads by more than 10%. Unlike static harvesting techniques","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443367,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443367/thumbnails/1.jpg","file_name":"eurosys21.pdf","download_url":"https://www.academia.edu/attachments/80443367/download_file","bulk_download_file_name":"SmartHarvest.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443367/eurosys21-libre.pdf?1644277588=\u0026response-content-disposition=attachment%3B+filename%3DSmartHarvest.pdf\u0026Expires=1743681117\u0026Signature=BWTqa-8mVQM77Ckw3AQk2lnrfBeOjOPbF-tnWf~s7adzI5S3sAxaPrSRRJs9TrNZhb8r9~GxFIE60dIJ1A0~8R93C~vLYrNzRc03ULDPfCOmbOy~FavHVZsI6ESyUAZgL~35ii-1A8NSsmufDcH1Jwn0F4zYGtRESzHkEO5liSv1oUemnQLfAKOZmumvUX8Qd38zHb5bL1swJx2PxBMwjmdWW-ZOg2uqL~rM4z~Vs0s7G6W53~IB7rpauiaj0fVvD31Oxh82wUyDce4oEMbLK9FslDT0sVhEV-bhnzgJc5hEVGp9QctSqWRKVkToaIKtS83AWjyHoErMCrgLHNoL8w__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"}],"urls":[{"id":17437689,"url":"https://dl.acm.org/doi/pdf/10.1145/3447786.3456225"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877571-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="26279522"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" rel="nofollow" href="https://www.academia.edu/26279522/Urdb"><img alt="Research paper thumbnail of Urdb" class="work-thumbnail" src="https://a.academia-assets.com/images/blank-paper.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title">Urdb</div><div class="wp-workCard_item wp-workCard--coauthors"><span>by </span><span><a 
class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/TylerDeniston">Tyler Deniston</a>, <a class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/KapilArya2">Kapil Arya</a>, and <a class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/AnaMariaVisan3">Ana-Maria Visan</a></span></div><div class="wp-workCard_item"><span>Proceedings of the 6th Workshop on Programming Languages and Operating Systems - PLOS &#39;11</span><span>, 2011</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced ba...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced based on checkpoint/re-execute. It adds reversibility to a debugger, while still placing the end user within the familiar environment of their preferred debugger. 
The URDB software layer currently includes modes that understand the syntax for four debuggers: GDB for C/C++/Java/Fortran, Python (pdb), MATLAB, and Perl (perl</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="26279522"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="26279522"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 26279522; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=26279522]").text(description); $(".js-view-count[data-work-id=26279522]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 26279522; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='26279522']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 
})(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (false){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "-1" } } $('.js-work-strip[data-work-id=26279522]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":26279522,"title":"Urdb","translated_title":"","metadata":{"abstract":"Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced based on checkpoint/re-execute. It adds reversibility to a debugger, while still placing the end user within the familiar environment of their preferred debugger. The URDB software layer currently includes modes that understand the syntax for four debuggers: GDB for C/C++/Java/Fortran, Python (pdb), MATLAB, and Perl (perl","publication_date":{"day":null,"month":null,"year":2011,"errors":{}},"publication_name":"Proceedings of the 6th Workshop on Programming Languages and Operating Systems - PLOS '11"},"translated_abstract":"Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced based on checkpoint/re-execute. It adds reversibility to a debugger, while still placing the end user within the familiar environment of their preferred debugger. 
The URDB software layer currently includes modes that understand the syntax for four debuggers: GDB for C/C++/Java/Fortran, Python (pdb), MATLAB, and Perl (perl","internal_url":"https://www.academia.edu/26279522/Urdb","translated_internal_url":"","created_at":"2016-06-18T05:21:08.498-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":50195091,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[{"id":21451873,"work_id":26279522,"tagging_user_id":50195091,"tagged_user_id":46423,"co_author_invite_id":4805756,"email":"k***7@gmail.com","display_order":0,"name":"Kapil Arya","title":"Urdb"},{"id":21451874,"work_id":26279522,"tagging_user_id":50195091,"tagged_user_id":null,"co_author_invite_id":4805757,"email":"t***r@c6rs.com","display_order":4194304,"name":"Tyler Denniston","title":"Urdb"},{"id":21451875,"work_id":26279522,"tagging_user_id":50195091,"tagged_user_id":49938451,"co_author_invite_id":null,"email":"g***e@ccs.neu.edu","affiliation":"Northeastern University","display_order":6291456,"name":"Gene Cooperman","title":"Urdb"},{"id":21451876,"work_id":26279522,"tagging_user_id":50195091,"tagged_user_id":50013738,"co_author_invite_id":null,"email":"a***n@google.com","display_order":7340032,"name":"Ana-Maria Visan","title":"Urdb"}],"downloadable_attachments":[],"slug":"Urdb","translated_slug":"","page_count":null,"language":"en","content_type":"Work","summary":"Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced based on checkpoint/re-execute. It adds reversibility to a debugger, while still placing the end user within the familiar environment of their preferred debugger. 
The URDB software layer currently includes modes that understand the syntax for four debuggers: GDB for C/C++/Java/Fortran, Python (pdb), MATLAB, and Perl (perl","owner":{"id":50195091,"first_name":"Tyler","middle_initials":null,"last_name":"Deniston","page_name":"TylerDeniston","domain_name":"independent","created_at":"2016-06-18T05:20:57.618-07:00","display_name":"Tyler Deniston","url":"https://independent.academia.edu/TylerDeniston"},"attachments":[],"research_interests":[{"id":70448,"name":"Fortran","url":"https://www.academia.edu/Documents/in/Fortran"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-26279522-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="26098718"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" rel="nofollow" href="https://www.academia.edu/26098718/Temporal_Debugging_Automating_Time_Travel_Debugging_with_URDB"><img alt="Research paper thumbnail of Temporal Debugging: Automating Time Travel Debugging with URDB" class="work-thumbnail" src="https://a.academia-assets.com/images/blank-paper.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title">Temporal Debugging: Automating Time Travel Debugging with URDB</div><div class="wp-workCard_item wp-workCard--coauthors"><span>by </span><span><a class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/AnaMariaVisan3">Ana-Maria Visan</a> and <a class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/KapilArya2">Kapil Arya</a></span></div><div class="wp-workCard_item"><span>ccs.neu.edu</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">... 
Xin Dong Northeastern University xindong@ccs.neu.edu Kapil Arya Tyler Denniston Praveen S. So...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">... Xin Dong Northeastern University <a href="mailto:xindong@ccs.neu.edu" rel="nofollow">xindong@ccs.neu.edu</a> Kapil Arya Tyler Denniston Praveen S. Solanki Gene Cooperman Northeastern University {kapil,tyler,psolanki,gene}@ccs.neu.edu Abstract This work addresses two classical problems in debugging. ...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="26098718"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="26098718"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 26098718; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=26098718]").text(description); $(".js-view-count[data-work-id=26098718]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 26098718; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = 
$(".js-work-strip[data-work-id='26098718']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (false){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "-1" } } $('.js-work-strip[data-work-id=26098718]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":26098718,"title":"Temporal Debugging: Automating Time Travel Debugging with URDB","translated_title":"","metadata":{"abstract":"... Xin Dong Northeastern University xindong@ccs.neu.edu Kapil Arya Tyler Denniston Praveen S. Solanki Gene Cooperman Northeastern University {kapil,tyler,psolanki,gene}@ccs.neu.edu Abstract This work addresses two classical problems in debugging. ...","publication_name":"ccs.neu.edu"},"translated_abstract":"... Xin Dong Northeastern University xindong@ccs.neu.edu Kapil Arya Tyler Denniston Praveen S. Solanki Gene Cooperman Northeastern University {kapil,tyler,psolanki,gene}@ccs.neu.edu Abstract This work addresses two classical problems in debugging. 
...","internal_url":"https://www.academia.edu/26098718/Temporal_Debugging_Automating_Time_Travel_Debugging_with_URDB","translated_internal_url":"","created_at":"2016-06-13T17:28:51.638-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":50013738,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[{"id":21246036,"work_id":26098718,"tagging_user_id":50013738,"tagged_user_id":46423,"co_author_invite_id":null,"email":"k***7@gmail.com","display_order":0,"name":"Kapil Arya","title":"Temporal Debugging: Automating Time Travel Debugging with URDB"},{"id":21246042,"work_id":26098718,"tagging_user_id":50013738,"tagged_user_id":null,"co_author_invite_id":4750018,"email":"t***r@csail.mit.edu","display_order":4194304,"name":"Tyler Denniston","title":"Temporal Debugging: Automating Time Travel Debugging with URDB"},{"id":21246044,"work_id":26098718,"tagging_user_id":50013738,"tagged_user_id":null,"co_author_invite_id":4762587,"email":"a***x@locomizer.com","display_order":6291456,"name":"A. Polyakov","title":"Temporal Debugging: Automating Time Travel Debugging with URDB"}],"downloadable_attachments":[],"slug":"Temporal_Debugging_Automating_Time_Travel_Debugging_with_URDB","translated_slug":"","page_count":null,"language":"en","content_type":"Work","summary":"... Xin Dong Northeastern University xindong@ccs.neu.edu Kapil Arya Tyler Denniston Praveen S. Solanki Gene Cooperman Northeastern University {kapil,tyler,psolanki,gene}@ccs.neu.edu Abstract This work addresses two classical problems in debugging. 
...","owner":{"id":50013738,"first_name":"Ana-Maria","middle_initials":null,"last_name":"Visan","page_name":"AnaMariaVisan3","domain_name":"independent","created_at":"2016-06-13T17:27:43.265-07:00","display_name":"Ana-Maria Visan","url":"https://independent.academia.edu/AnaMariaVisan3"},"attachments":[],"research_interests":[],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-26098718-figures'); } }); </script> </div><div class="profile--tab_content_container js-tab-pane tab-pane" data-section-id="391205" id="papers"><div class="js-work-strip profile--work_container" data-work-id="117958233"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/117958233/Transparent_Checkpoint_Restart_over_InfiniBand"><img alt="Research paper thumbnail of Transparent Checkpoint-Restart over InfiniBand" class="work-thumbnail" src="https://attachments.academia-assets.com/113694222/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/117958233/Transparent_Checkpoint_Restart_over_InfiniBand">Transparent Checkpoint-Restart over InfiniBand</a></div><div class="wp-workCard_item"><span>arXiv (Cornell University)</span><span>, Dec 13, 2013</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">InfiniBand is widely used for low-latency, high-throughput cluster computing. 
Saving the state of...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">InfiniBand is widely used for low-latency, high-throughput cluster computing. Saving the state of the InfiniBand network as part of distributed checkpointing has been a long-standing challenge for researchers. Because of a lack of a solution, typical MPI implementations have included custom checkpoint-restart services that &quot;tear down&quot; the network, checkpoint each node as if the node were a standalone computer, and then reconnect the network again. We present the first example of transparent, system-initiated checkpoint-restart that directly supports In-finiBand. The new approach is independent of any particular Linux kernel, thus simplifying the current practice of using a kernel-based module, such as BLCR. This direct approach results in checkpoints that are found to be faster than with the use of a checkpoint-restart service. The generality of this approach is shown not only by checkpointing an MPI computation, but also a native UPC computation (Berkeley Unified Parallel C), which does not use MPI. Scalability is shown by checkpointing 2,048 MPI processes across 128 nodes (with 16 cores per node). In addition, a cost-effective debugging approach is also enabled, in which a checkpoint image from an InfiniBand-based production cluster is copied to a local Ethernet-based cluster, where it can be restarted and an interactive debugger can be attached to it. 
This work is based on a plugin that extends the DMTCP (Distributed MultiThreaded CheckPointing) checkpoint-restart package.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="92e1eb3446b39a3dd645808d227703a8" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:113694222,&quot;asset_id&quot;:117958233,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/113694222/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="117958233"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="117958233"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 117958233; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=117958233]").text(description); $(".js-view-count[data-work-id=117958233]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 117958233; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='117958233']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + 
percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "92e1eb3446b39a3dd645808d227703a8" } } $('.js-work-strip[data-work-id=117958233]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":117958233,"title":"Transparent Checkpoint-Restart over InfiniBand","translated_title":"","metadata":{"publisher":"Cornell University","grobid_abstract":"InfiniBand is widely used for low-latency, high-throughput cluster computing. Saving the state of the InfiniBand network as part of distributed checkpointing has been a long-standing challenge for researchers. Because of a lack of a solution, typical MPI implementations have included custom checkpoint-restart services that \"tear down\" the network, checkpoint each node as if the node were a standalone computer, and then reconnect the network again. We present the first example of transparent, system-initiated checkpoint-restart that directly supports In-finiBand. The new approach is independent of any particular Linux kernel, thus simplifying the current practice of using a kernel-based module, such as BLCR. This direct approach results in checkpoints that are found to be faster than with the use of a checkpoint-restart service. 
The generality of this approach is shown not only by checkpointing an MPI computation, but also a native UPC computation (Berkeley Unified Parallel C), which does not use MPI. Scalability is shown by checkpointing 2,048 MPI processes across 128 nodes (with 16 cores per node). In addition, a cost-effective debugging approach is also enabled, in which a checkpoint image from an InfiniBand-based production cluster is copied to a local Ethernet-based cluster, where it can be restarted and an interactive debugger can be attached to it. This work is based on a plugin that extends the DMTCP (Distributed MultiThreaded CheckPointing) checkpoint-restart package.","publication_date":{"day":13,"month":12,"year":2013,"errors":{}},"publication_name":"arXiv (Cornell University)","grobid_abstract_attachment_id":113694221},"translated_abstract":null,"internal_url":"https://www.academia.edu/117958233/Transparent_Checkpoint_Restart_over_InfiniBand","translated_internal_url":"","created_at":"2024-04-23T15:13:55.878-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":113694222,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694222/thumbnails/1.jpg","file_name":"1312.pdf","download_url":"https://www.academia.edu/attachments/113694222/download_file","bulk_download_file_name":"Transparent_Checkpoint_Restart_over_Infi.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694222/1312-libre.pdf?1713911126=\u0026response-content-disposition=attachment%3B+filename%3DTransparent_Checkpoint_Restart_over_Infi.pdf\u0026Expires=1743681116\u0026Signature=Vw5dKUWuajiZISQsaH40UrO8TgeuT3wDjIy2X85aK1ttBBlWCsYSz6KT49-AKj36APKzpX1-Ou-XaIy1EbRGptB82acZm-mte6y370-f-wLkCYfscbinVcx63SzHX69-jZ8fsvcpXgdlEBHKiuMdI0ZMZrmRFI4tG61bXtkD3O9bHh~wjTuQzBs81HF~IkaKY5LV3vwXpxS~qe5g5Mc3JC9PorDysfjhY3jYdj8eHVu2sos3
sWPdQSxthiiG-6UoPudQM3mBmfqQJSWbgLgahXM3IOcz8uZDwiyFHg21zCZ4fi6BNRE--bto0U3Oq6Jas2764d0lIstEeiPa5GM05A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Transparent_Checkpoint_Restart_over_InfiniBand","translated_slug":"","page_count":22,"language":"en","content_type":"Work","summary":"InfiniBand is widely used for low-latency, high-throughput cluster computing. Saving the state of the InfiniBand network as part of distributed checkpointing has been a long-standing challenge for researchers. Because of a lack of a solution, typical MPI implementations have included custom checkpoint-restart services that \"tear down\" the network, checkpoint each node as if the node were a standalone computer, and then reconnect the network again. We present the first example of transparent, system-initiated checkpoint-restart that directly supports In-finiBand. The new approach is independent of any particular Linux kernel, thus simplifying the current practice of using a kernel-based module, such as BLCR. This direct approach results in checkpoints that are found to be faster than with the use of a checkpoint-restart service. The generality of this approach is shown not only by checkpointing an MPI computation, but also a native UPC computation (Berkeley Unified Parallel C), which does not use MPI. Scalability is shown by checkpointing 2,048 MPI processes across 128 nodes (with 16 cores per node). In addition, a cost-effective debugging approach is also enabled, in which a checkpoint image from an InfiniBand-based production cluster is copied to a local Ethernet-based cluster, where it can be restarted and an interactive debugger can be attached to it. 
This work is based on a plugin that extends the DMTCP (Distributed MultiThreaded CheckPointing) checkpoint-restart package.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":113694222,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694222/thumbnails/1.jpg","file_name":"1312.pdf","download_url":"https://www.academia.edu/attachments/113694222/download_file","bulk_download_file_name":"Transparent_Checkpoint_Restart_over_Infi.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694222/1312-libre.pdf?1713911126=\u0026response-content-disposition=attachment%3B+filename%3DTransparent_Checkpoint_Restart_over_Infi.pdf\u0026Expires=1743681116\u0026Signature=Vw5dKUWuajiZISQsaH40UrO8TgeuT3wDjIy2X85aK1ttBBlWCsYSz6KT49-AKj36APKzpX1-Ou-XaIy1EbRGptB82acZm-mte6y370-f-wLkCYfscbinVcx63SzHX69-jZ8fsvcpXgdlEBHKiuMdI0ZMZrmRFI4tG61bXtkD3O9bHh~wjTuQzBs81HF~IkaKY5LV3vwXpxS~qe5g5Mc3JC9PorDysfjhY3jYdj8eHVu2sos3sWPdQSxthiiG-6UoPudQM3mBmfqQJSWbgLgahXM3IOcz8uZDwiyFHg21zCZ4fi6BNRE--bto0U3Oq6Jas2764d0lIstEeiPa5GM05A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":113694221,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694221/thumbnails/1.jpg","file_name":"1312.pdf","download_url":"https://www.academia.edu/attachments/113694221/download_file","bulk_download_file_name":"Transparent_Checkpoint_Restart_over_Infi.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694221/1312-libre.pdf?1713911125=\u0026response-content-disposition=attachment%3B+filename%3DTransparent_Checkpoint_Restart_over_Infi.pdf\u0026Expires=1743681116\u0026Signature=YxeUWCf8xCVNpEsm~xB3WMypwacgzb79h1BC8l~tGuC1VhTqCT71Z8ySFIZauU6lClKzMY1XGPes26DKjWtJO4SGG6MLkOnRKq9Qiv-aWsLnddptR
e0iakMuuSy8s5Rvb5-k3FVEFJBN5qju6TabN2hSl1HvMYSGrLTmQAMbNIDOdY5ByU9RLQN-7IErkryLs7DFB79yYR9SzIZ~xGxyqvIyo2rEKY5ONzk5Ehwq9qv2TiI~1p1xGa1Q-cK7WvzEmDQ8lQCAT~ewlVuTy9xSvMPhMgPi~MK1uA8i9ImqEjLAnuxgMa8Y9gABikE1VWArobdgKrETDUU6pmDw~xGrSQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":442,"name":"Parallel Computing","url":"https://www.academia.edu/Documents/in/Parallel_Computing"},{"id":44244,"name":"OPERATING SYSTEM","url":"https://www.academia.edu/Documents/in/OPERATING_SYSTEM"},{"id":75768,"name":"MPI","url":"https://www.academia.edu/Documents/in/MPI"},{"id":377043,"name":"Scalability","url":"https://www.academia.edu/Documents/in/Scalability"},{"id":491492,"name":"InfiniBand","url":"https://www.academia.edu/Documents/in/InfiniBand"},{"id":983490,"name":"Operating Systems (In Computer Science)","url":"https://www.academia.edu/Documents/in/Operating_Systems_In_Computer_Science_"},{"id":1188947,"name":"D","url":"https://www.academia.edu/Documents/in/D-351414216"}],"urls":[{"id":41341950,"url":"http://arxiv.org/pdf/1312.3938"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-117958233-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="117958232"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/117958232/Adapting_the_DMTCP_Plugin_Model_for_Checkpointing_of_Hardware_Emulation"><img alt="Research paper thumbnail of Adapting the DMTCP Plugin Model for Checkpointing of Hardware Emulation" class="work-thumbnail" src="https://attachments.academia-assets.com/113694220/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div 
class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/117958232/Adapting_the_DMTCP_Plugin_Model_for_Checkpointing_of_Hardware_Emulation">Adapting the DMTCP Plugin Model for Checkpointing of Hardware Emulation</a></div><div class="wp-workCard_item"><span>ArXiv</span><span>, 2017</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Checkpoint-restart is now a mature technology. It allows a user to save and later restore the sta...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Checkpoint-restart is now a mature technology. It allows a user to save and later restore the state of a running process. The new plugin model for the upcoming version 3.0 of DMTCP (Distributed MultiThreaded Checkpointing) is described here. This plugin model allows a target application to disconnect from the hardware emulator at checkpoint time and then re-connect to a possibly different hardware emulator at the time of restart. The DMTCP plugin model is important in allowing three distinct parties to seamlessly inter-operate. The three parties are: the EDA designer, who is concerned with formal verification of a circuit design; the DMTCP developers, who are concerned with providing transparent checkpointing during the circuit emulation; and the hardware emulator vendor, who provides a plugin library that responds to checkpoint, restart, and other events. 
The new plugin model is an example of process-level virtualization: virtualization of external abstractions from within a proces...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="677f230684aec6c8dfcd767d8bdf7399" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:113694220,&quot;asset_id&quot;:117958232,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/113694220/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="117958232"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="117958232"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 117958232; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=117958232]").text(description); $(".js-view-count[data-work-id=117958232]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 117958232; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='117958232']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + 
percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "677f230684aec6c8dfcd767d8bdf7399" } } $('.js-work-strip[data-work-id=117958232]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":117958232,"title":"Adapting the DMTCP Plugin Model for Checkpointing of Hardware Emulation","translated_title":"","metadata":{"abstract":"Checkpoint-restart is now a mature technology. It allows a user to save and later restore the state of a running process. The new plugin model for the upcoming version 3.0 of DMTCP (Distributed MultiThreaded Checkpointing) is described here. This plugin model allows a target application to disconnect from the hardware emulator at checkpoint time and then re-connect to a possibly different hardware emulator at the time of restart. The DMTCP plugin model is important in allowing three distinct parties to seamlessly inter-operate. The three parties are: the EDA designer, who is concerned with formal verification of a circuit design; the DMTCP developers, who are concerned with providing transparent checkpointing during the circuit emulation; and the hardware emulator vendor, who provides a plugin library that responds to checkpoint, restart, and other events. 
The new plugin model is an example of process-level virtualization: virtualization of external abstractions from within a proces...","publisher":"ArXiv","ai_title_tag":"DMTCP Plugin Model for Hardware Emulation","publication_date":{"day":null,"month":null,"year":2017,"errors":{}},"publication_name":"ArXiv"},"translated_abstract":"Checkpoint-restart is now a mature technology. It allows a user to save and later restore the state of a running process. The new plugin model for the upcoming version 3.0 of DMTCP (Distributed MultiThreaded Checkpointing) is described here. This plugin model allows a target application to disconnect from the hardware emulator at checkpoint time and then re-connect to a possibly different hardware emulator at the time of restart. The DMTCP plugin model is important in allowing three distinct parties to seamlessly inter-operate. The three parties are: the EDA designer, who is concerned with formal verification of a circuit design; the DMTCP developers, who are concerned with providing transparent checkpointing during the circuit emulation; and the hardware emulator vendor, who provides a plugin library that responds to checkpoint, restart, and other events. 
The new plugin model is an example of process-level virtualization: virtualization of external abstractions from within a proces...","internal_url":"https://www.academia.edu/117958232/Adapting_the_DMTCP_Plugin_Model_for_Checkpointing_of_Hardware_Emulation","translated_internal_url":"","created_at":"2024-04-23T15:13:55.593-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":113694220,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694220/thumbnails/1.jpg","file_name":"1703.00897v1.pdf","download_url":"https://www.academia.edu/attachments/113694220/download_file","bulk_download_file_name":"Adapting_the_DMTCP_Plugin_Model_for_Chec.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694220/1703.00897v1-libre.pdf?1713911121=\u0026response-content-disposition=attachment%3B+filename%3DAdapting_the_DMTCP_Plugin_Model_for_Chec.pdf\u0026Expires=1743681116\u0026Signature=TPCYVG37lq2mnb15OKVsC7YgZSS92NjA7eloh2UT4QWMS70QcLMCXQJCQ4A5y00HWu9yr~gQBf75-mcICbIt5NLXgWDm53-b4oXAkYO1PdvXkLnMgFR4gYyqtQ54EJN4EqdvR-C5mHAduIDW7q5b6rJAnwx8~f8dm9pKCXMbyV3h9S2hhxVUK75in3QO71loUu2UENs3ZSGqfSrCBBv4oKo93WTB6onMM-zUdDCcWj~Tnl9XFnBxIvEiLXEvegPDvyEPiNfRxtFhET3B5pwCLwvKdcPZG8LAtim-kiBe-K3vcKhdvXMdqY2u6SwOXEsdLvZiag1vwTOoLQhOEOG7Vw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Adapting_the_DMTCP_Plugin_Model_for_Checkpointing_of_Hardware_Emulation","translated_slug":"","page_count":6,"language":"en","content_type":"Work","summary":"Checkpoint-restart is now a mature technology. It allows a user to save and later restore the state of a running process. The new plugin model for the upcoming version 3.0 of DMTCP (Distributed MultiThreaded Checkpointing) is described here. 
This plugin model allows a target application to disconnect from the hardware emulator at checkpoint time and then re-connect to a possibly different hardware emulator at the time of restart. The DMTCP plugin model is important in allowing three distinct parties to seamlessly inter-operate. The three parties are: the EDA designer, who is concerned with formal verification of a circuit design; the DMTCP developers, who are concerned with providing transparent checkpointing during the circuit emulation; and the hardware emulator vendor, who provides a plugin library that responds to checkpoint, restart, and other events. The new plugin model is an example of process-level virtualization: virtualization of external abstractions from within a proces...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":113694220,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694220/thumbnails/1.jpg","file_name":"1703.00897v1.pdf","download_url":"https://www.academia.edu/attachments/113694220/download_file","bulk_download_file_name":"Adapting_the_DMTCP_Plugin_Model_for_Chec.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694220/1703.00897v1-libre.pdf?1713911121=\u0026response-content-disposition=attachment%3B+filename%3DAdapting_the_DMTCP_Plugin_Model_for_Chec.pdf\u0026Expires=1743681116\u0026Signature=TPCYVG37lq2mnb15OKVsC7YgZSS92NjA7eloh2UT4QWMS70QcLMCXQJCQ4A5y00HWu9yr~gQBf75-mcICbIt5NLXgWDm53-b4oXAkYO1PdvXkLnMgFR4gYyqtQ54EJN4EqdvR-C5mHAduIDW7q5b6rJAnwx8~f8dm9pKCXMbyV3h9S2hhxVUK75in3QO71loUu2UENs3ZSGqfSrCBBv4oKo93WTB6onMM-zUdDCcWj~Tnl9XFnBxIvEiLXEvegPDvyEPiNfRxtFhET3B5pwCLwvKdcPZG8LAtim-kiBe-K3vcKhdvXMdqY2u6SwOXEsdLvZiag1vwTOoLQhOEOG7Vw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":113694
219,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694219/thumbnails/1.jpg","file_name":"1703.00897v1.pdf","download_url":"https://www.academia.edu/attachments/113694219/download_file","bulk_download_file_name":"Adapting_the_DMTCP_Plugin_Model_for_Chec.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694219/1703.00897v1-libre.pdf?1713911119=\u0026response-content-disposition=attachment%3B+filename%3DAdapting_the_DMTCP_Plugin_Model_for_Chec.pdf\u0026Expires=1743681116\u0026Signature=YIkwMCN5cNbC51e7shniWopfbX3uIXQJigZQSMq0~RqVZ4EExQPa2O-xJkrZklVjXQsB8sExnIwEuigvmfOFXVbriflnpzR3GTvE2CkVQbxKOyp2a5vBO5l5X~XEaCnsebstSCt1oHwB9VbyO6IIU-zIndjHexF5e8wPF6oG~UIqqauiQgaygFienDLcKtFkyeCAZ9~A5LappD5Y~ksoyCQLoqUQAklPY8~ZOu~mrDk5lacRxYVXpDQtjLZTkK3ZvR73fKFfKziunGHFBvGphMDjrHhmNCEefE30MuF3Wl741GU5wTwhodG8tpDmjaXXAxUDqifym~kEqgydhDiY1A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":44244,"name":"OPERATING SYSTEM","url":"https://www.academia.edu/Documents/in/OPERATING_SYSTEM"},{"id":154848,"name":"Emulation","url":"https://www.academia.edu/Documents/in/Emulation"},{"id":1148030,"name":"Embedded System","url":"https://www.academia.edu/Documents/in/Embedded_System"},{"id":3193313,"name":"arXiv","url":"https://www.academia.edu/Documents/in/arXiv"}],"urls":[{"id":41341949,"url":"https://arxiv.org/pdf/1703.00897v1.pdf"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-117958232-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="117958231"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" 
href="https://www.academia.edu/117958231/DMTCP_Scalable_User_Level_Transparent_Checkpointing_for_Cluster_Computations"><img alt="Research paper thumbnail of DMTCP: Scalable User-Level Transparent Checkpointing for Cluster Computations" class="work-thumbnail" src="https://attachments.academia-assets.com/113694238/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/117958231/DMTCP_Scalable_User_Level_Transparent_Checkpointing_for_Cluster_Computations">DMTCP: Scalable User-Level Transparent Checkpointing for Cluster Computations</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">As the size of clusters increases, failures are becoming increasingly frequent. Applications must...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">As the size of clusters increases, failures are becoming increasingly frequent. Applications must become fault tolerant if they are to run for extended periods of time. We present DMTCP (Distributed MultiThreaded CheckPointing), the first user-level distributed checkpointing package not dependent on a specific message passing library. This contrasts with existing approaches either specific to libraries such as MPI or requiring kernel modification. DMTCP provides fault tolerance through checkpointing. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads. 
DMTCP automatically accounts for TCP/IP sockets, UNIX domain sockets, pipes, ptys (pseudo-terminals), signal handlers, ordinary file descriptors, shared file descriptors, and other operating system artifacts. We demonstrate checkpointing and restart of applications communicating through MPICH2, OpenMPI, and sockets directly. These applications were written with a variety of languages including Fortran, C, C++, and Python. Results show that checkpoint time remains nearly constant as the number of nodes increases.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="17b902baca41113359321f78cf3d844d" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:113694238,&quot;asset_id&quot;:117958231,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/113694238/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="117958231"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="117958231"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 117958231; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=117958231]").text(description); $(".js-view-count[data-work-id=117958231]").attr('title', description).tooltip(); }); 
});</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 117958231; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='117958231']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "17b902baca41113359321f78cf3d844d" } } $('.js-work-strip[data-work-id=117958231]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":117958231,"title":"DMTCP: Scalable User-Level Transparent Checkpointing for Cluster Computations","translated_title":"","metadata":{"ai_title_tag":"DMTCP: Scalable Checkpointing for Clusters","grobid_abstract":"As the size of clusters increases, failures are becoming increasingly frequent. Applications must become fault tolerant if they are to run for extended periods of time. We present DMTCP (Distributed MultiThreaded CheckPointing), the first user-level distributed checkpointing package not dependent on a specific message passing library. This contrasts with existing approaches either specific to libraries such as MPI or requiring kernel modification. 
DMTCP provides fault tolerance through checkpointing. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads. DMTCP automatically accounts for TCP/IP sockets, UNIX domain sockets, pipes, ptys (pseudo-terminals), signal handlers, ordinary file descriptors, shared file descriptors, and other operating system artifacts. We demonstrate checkpointing and restart of applications communicating through MPICH2, OpenMPI, and sockets directly. These applications were written with a variety of languages including Fortran, C, C++, and Python. Results show that checkpoint time remains nearly constant as the number of nodes increases.","publication_date":{"day":null,"month":null,"year":2008,"errors":{}},"grobid_abstract_attachment_id":113694238},"translated_abstract":null,"internal_url":"https://www.academia.edu/117958231/DMTCP_Scalable_User_Level_Transparent_Checkpointing_for_Cluster_Computations","translated_internal_url":"","created_at":"2024-04-23T15:13:54.477-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":113694238,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694238/thumbnails/1.jpg","file_name":"download.pdf","download_url":"https://www.academia.edu/attachments/113694238/download_file","bulk_download_file_name":"DMTCP_Scalable_User_Level_Transparent_Ch.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694238/download-libre.pdf?1713911122=\u0026response-content-disposition=attachment%3B+filename%3DDMTCP_Scalable_User_Level_Transparent_Ch.pdf\u0026Expires=1743681116\u0026Signature=XqZOUB7Xxn6n7GlykWGIrZtFaHjHk2sBYjjZPehJSE9hp~FjLi3t4yQ7pYH~DINVF5TPICKic-gJFkfvmGLLSiRqt1HmHF8UIQ8SDaxrR42IiSs36a05LYK~uSaUZFARdegvpiDJVJRfdP~mqNTFVHLP18jgFk8cW3~LtNzof7wF1S--iglqGt~rhlG-zVlJlHAFxNaPprUeXtwBzw6X6Uj-ZMqyBCfa
OzNzw0~d8g0DGJ4o09l6VWlmLkTQmB6~-dS~W4TYdWgWn89k86-748KVR-2sIHRjMpIHdwFmgHUiPmd7oBSY5kLlHrCpN-pQyvqfprqd~3EgInk760pngQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"DMTCP_Scalable_User_Level_Transparent_Checkpointing_for_Cluster_Computations","translated_slug":"","page_count":17,"language":"en","content_type":"Work","summary":"As the size of clusters increases, failures are becoming increasingly frequent. Applications must become fault tolerant if they are to run for extended periods of time. We present DMTCP (Distributed MultiThreaded CheckPointing), the first user-level distributed checkpointing package not dependent on a specific message passing library. This contrasts with existing approaches either specific to libraries such as MPI or requiring kernel modification. DMTCP provides fault tolerance through checkpointing. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads. DMTCP automatically accounts for TCP/IP sockets, UNIX domain sockets, pipes, ptys (pseudo-terminals), signal handlers, ordinary file descriptors, shared file descriptors, and other operating system artifacts. We demonstrate checkpointing and restart of applications communicating through MPICH2, OpenMPI, and sockets directly. These applications were written with a variety of languages including Fortran, C, C++, and Python. 
Results show that checkpoint time remains nearly constant as the number of nodes increases.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":113694238,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694238/thumbnails/1.jpg","file_name":"download.pdf","download_url":"https://www.academia.edu/attachments/113694238/download_file","bulk_download_file_name":"DMTCP_Scalable_User_Level_Transparent_Ch.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694238/download-libre.pdf?1713911122=\u0026response-content-disposition=attachment%3B+filename%3DDMTCP_Scalable_User_Level_Transparent_Ch.pdf\u0026Expires=1743681116\u0026Signature=XqZOUB7Xxn6n7GlykWGIrZtFaHjHk2sBYjjZPehJSE9hp~FjLi3t4yQ7pYH~DINVF5TPICKic-gJFkfvmGLLSiRqt1HmHF8UIQ8SDaxrR42IiSs36a05LYK~uSaUZFARdegvpiDJVJRfdP~mqNTFVHLP18jgFk8cW3~LtNzof7wF1S--iglqGt~rhlG-zVlJlHAFxNaPprUeXtwBzw6X6Uj-ZMqyBCfaOzNzw0~d8g0DGJ4o09l6VWlmLkTQmB6~-dS~W4TYdWgWn89k86-748KVR-2sIHRjMpIHdwFmgHUiPmd7oBSY5kLlHrCpN-pQyvqfprqd~3EgInk760pngQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":442,"name":"Parallel Computing","url":"https://www.academia.edu/Documents/in/Parallel_Computing"},{"id":8137,"name":"Unix","url":"https://www.academia.edu/Documents/in/Unix"},{"id":36300,"name":"Fault Tolerance","url":"https://www.academia.edu/Documents/in/Fault_Tolerance"},{"id":44244,"name":"OPERATING 
SYSTEM","url":"https://www.academia.edu/Documents/in/OPERATING_SYSTEM"},{"id":59487,"name":"Computation","url":"https://www.academia.edu/Documents/in/Computation"},{"id":377043,"name":"Scalability","url":"https://www.academia.edu/Documents/in/Scalability"},{"id":2038221,"name":"Computer Cluster","url":"https://www.academia.edu/Documents/in/Computer_Cluster"},{"id":3336228,"name":"python programming language","url":"https://www.academia.edu/Documents/in/python_programming_language"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-117958231-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="117958223"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/117958223/System_level_Scalable_Checkpoint_Restart_for_Petascale_Computing"><img alt="Research paper thumbnail of System-level Scalable Checkpoint-Restart for Petascale Computing" class="work-thumbnail" src="https://attachments.academia-assets.com/113694213/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/117958223/System_level_Scalable_Checkpoint_Restart_for_Petascale_Computing">System-level Scalable Checkpoint-Restart for Petascale Computing</a></div><div class="wp-workCard_item"><span>arXiv (Cornell University)</span><span>, Jul 27, 2016</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Fault tolerance for the upcoming exascale generation has long been an area of active research. 
On...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Fault tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that Infini-Band UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. 
The approach is also evaluated across three widely used MPI implementations.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="1fb54398cf1659068c2e70608a9a44c4" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:113694213,&quot;asset_id&quot;:117958223,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/113694213/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="117958223"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="117958223"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 117958223; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=117958223]").text(description); $(".js-view-count[data-work-id=117958223]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 117958223; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='117958223']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); 
container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "1fb54398cf1659068c2e70608a9a44c4" } } $('.js-work-strip[data-work-id=117958223]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":117958223,"title":"System-level Scalable Checkpoint-Restart for Petascale Computing","translated_title":"","metadata":{"publisher":"Cornell University","ai_title_tag":"Scalable Checkpoint-Restart for Petascale HPC","grobid_abstract":"Fault tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that Infini-Band UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. 
In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. The approach is also evaluated across three widely used MPI implementations.","publication_date":{"day":27,"month":7,"year":2016,"errors":{}},"publication_name":"arXiv (Cornell University)","grobid_abstract_attachment_id":113694213},"translated_abstract":null,"internal_url":"https://www.academia.edu/117958223/System_level_Scalable_Checkpoint_Restart_for_Petascale_Computing","translated_internal_url":"","created_at":"2024-04-23T15:13:23.548-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":113694213,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694213/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/113694213/download_file","bulk_download_file_name":"System_level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694213/1607-libre.pdf?1713911128=\u0026response-content-disposition=attachment%3B+filename%3DSystem_level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681116\u0026Signature=grFADiIVUOrEjyqtnQi7JgTjvIyGHwogxiRu3lm~C-8-rEwLiM~wke6tk2iee6bH99pZYRcK2oOh6f0M4LMdT1uwQyOVQzlwnmbHRcAP9i8uQUqA1I9xCXFIb7caG6VVoKADRecgY2CoNyr1ouGoR962Kb6SFNgQRtNsmWckN6iURRhvh3nFxrS2FK9Yfg2E37XEqO74ckOqZCErbafRBXomH8FbBcCe9~4hXLYKj3sWMqCseP69UE35tkMakMNz2UGTY4zfkMEXeB4~3jsKJqQ9ULsyFsqJcxFY8M3txOx-s2ynOjANK-WTKbAfRDYgykl4-vtHqE9JUuHpy8SNNw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"System_level_Scalable_Checkpoint_Restart_for_Petascale_Computing","translated_slug":"","page_count":18,"language":"en","content_type":"Work","summary":"Fault tolerance for the upcoming exascale generation has long been 
an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that Infini-Band UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. The approach is also evaluated across three widely used MPI implementations.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":113694213,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694213/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/113694213/download_file","bulk_download_file_name":"System_level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694213/1607-libre.pdf?1713911128=\u0026response-content-disposition=attachment%3B+filename%3DSystem_level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681116\u0026Signature=grFADiIVUOrEjyqtnQi7JgTjvIyGHwogxiRu3lm~C-8-rEwLiM~wke6tk2iee6bH99pZYRcK2oOh6f0M4LMdT1uwQyOVQzlwnmbHRcAP9i8uQUqA1I9xCXFIb7caG6VVoKADRecgY2CoNyr1ouGoR962Kb6SFNgQRtNsmWckN6iURRhvh3nFxrS2FK9Yfg2E37XEqO74ckOqZCErbafRBXomH8FbBcCe9~4hXLYKj3sWMqCseP69UE35tkMakMNz2UGTY4zfkMEXeB4~3jsKJqQ9ULsyFsqJcxFY8M3txOx-s2ynOjANK-WTKbAfRDYgykl4-vtHqE9JUuHpy8SNNw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":113694212,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/113694212/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/113694212/download_file","bulk_download_file_name":"System_level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/113694212/1607-libre.pdf?1713911123=\u0026response-content-disposition=attachment%3B+filename%3DSystem_level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681116\u0026Signature=KX3LYkqNVWXGdt8V3TymY~Bo6WKcjDYfK-k6Uahja2Vd4xc~MaKuxWlXi-yiRCdnZJ3n6tFDbvdD8MkZOCfmhR-QcqawRvV4068NP0VaX7-uL4-a-k2PXZhr0Y9LWRvXZEDCKQel2AruubKYnk5yjKdYZFbcPx7rcbJ8zCQKzCg6Jbst4y8Nl8glFGMScVkm4hZSZDg4IdfIDuWFoCvHMJa-uan9ouz9DIiAZEZon9~ooEaTJOJ3QSNExXQbDxnnh-8QA4sQk50OO~1YK40AaQsrIOn-YDBaTrFSy3ru-4DklT8dG7Yqm3FkbpMUnJAXpoVZBtYMlMzqSpCEfET~dQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer 
Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":442,"name":"Parallel Computing","url":"https://www.academia.edu/Documents/in/Parallel_Computing"},{"id":36300,"name":"Fault Tolerance","url":"https://www.academia.edu/Documents/in/Fault_Tolerance"},{"id":238655,"name":"Implementation","url":"https://www.academia.edu/Documents/in/Implementation"},{"id":377043,"name":"Scalability","url":"https://www.academia.edu/Documents/in/Scalability"},{"id":491492,"name":"InfiniBand","url":"https://www.academia.edu/Documents/in/InfiniBand"},{"id":2141217,"name":"Supercomputer","url":"https://www.academia.edu/Documents/in/Supercomputer"}],"urls":[{"id":41341945,"url":"http://arxiv.org/pdf/1607.07995"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-117958223-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="92504210"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/92504210/Extended_Batch_Sessions_and_Three_Phase_Debugging"><img alt="Research paper thumbnail of Extended Batch Sessions and Three-Phase Debugging" class="work-thumbnail" src="https://attachments.academia-assets.com/95496120/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/92504210/Extended_Batch_Sessions_and_Three_Phase_Debugging">Extended Batch Sessions and Three-Phase Debugging</a></div><div class="wp-workCard_item"><span>Proceedings of the XSEDE16 Conference on Diversity, Big Data, and Science at Scale</span><span>, 2016</span></div><div class="wp-workCard_item"><span 
class="js-work-more-abstract-truncated">Batch environments are notoriously unfriendly because it&#39;s not easy to interactively diagnose the...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Batch environments are notoriously unfriendly because it&#39;s not easy to interactively diagnose the health of a job. A job may be terminated without warning when it reaches the end of an allotted runtime slot, or it may terminate even sooner due to an unsuspected bug that occurs only at large scale. Two strategies are proposed that take advantage of DMT-CP (Distributed MultiThreaded CheckPointing) for systemlevel checkpointing. First, we describe a three-phase debugging strategy that permits one to interactively debug long-running MPI applications that were developed for noninteractive batch environments. Second, we review how to use the SLURM resource manager capability to easily implement extended batch sessions that overcome the typical limitation of 24 hours maximum for a single batch job on large HPC resources. We argue for greater use of this lesser known capability, as a means to remove the necessity for the application-specific checkpointing found in many longrunning jobs. 
CCS Concepts •Software and its engineering → Checkpoint / restart; Software testing and debugging;</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="b85581bfed410b8ceec7424b0b1ca574" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:95496120,&quot;asset_id&quot;:92504210,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/95496120/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="92504210"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="92504210"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 92504210; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=92504210]").text(description); $(".js-view-count[data-work-id=92504210]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 92504210; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='92504210']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); 
container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "b85581bfed410b8ceec7424b0b1ca574" } } $('.js-work-strip[data-work-id=92504210]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":92504210,"title":"Extended Batch Sessions and Three-Phase Debugging","translated_title":"","metadata":{"publisher":"ACM","grobid_abstract":"Batch environments are notoriously unfriendly because it's not easy to interactively diagnose the health of a job. A job may be terminated without warning when it reaches the end of an allotted runtime slot, or it may terminate even sooner due to an unsuspected bug that occurs only at large scale. Two strategies are proposed that take advantage of DMT-CP (Distributed MultiThreaded CheckPointing) for systemlevel checkpointing. First, we describe a three-phase debugging strategy that permits one to interactively debug long-running MPI applications that were developed for noninteractive batch environments. Second, we review how to use the SLURM resource manager capability to easily implement extended batch sessions that overcome the typical limitation of 24 hours maximum for a single batch job on large HPC resources. 
We argue for greater use of this lesser known capability, as a means to remove the necessity for the application-specific checkpointing found in many longrunning jobs. CCS Concepts •Software and its engineering → Checkpoint / restart; Software testing and debugging;","publication_date":{"day":null,"month":null,"year":2016,"errors":{}},"publication_name":"Proceedings of the XSEDE16 Conference on Diversity, Big Data, and Science at Scale","grobid_abstract_attachment_id":95496120},"translated_abstract":null,"internal_url":"https://www.academia.edu/92504210/Extended_Batch_Sessions_and_Three_Phase_Debugging","translated_internal_url":"","created_at":"2022-12-09T09:54:14.020-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":95496120,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496120/thumbnails/1.jpg","file_name":"2949550.pdf","download_url":"https://www.academia.edu/attachments/95496120/download_file","bulk_download_file_name":"Extended_Batch_Sessions_and_Three_Phase.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496120/2949550-libre.pdf?1670611737=\u0026response-content-disposition=attachment%3B+filename%3DExtended_Batch_Sessions_and_Three_Phase.pdf\u0026Expires=1743681116\u0026Signature=ANXxvFlML1ANNEFpS41RUFcumBmLgW46vb5HJ9By9LIVXYlJldh1DgHJbP7fQTRF017VJQMqj9k6sS4ULCnWsfdc-bRlbVtAc1D7toFJylrjSDvEYHSsCw~9DmoXsVoLRVp5eOnfgujaMTPmGlgWKbQdIAzBmPWGsWRIpdFFJxosyn5OnGG2D7Ih-LcT42BTSQo2IkimzJTZJ-peDmKVZYlTmplHAwADM0Gk7Zm3TsFyVbiaYcxBXt5dK~fKMVI7KksjJqKIFI30f62hvulZqRLpHU19DtN3EpFm2Ju5D~-jd1dY0DDa10jq0lyO2LlOMYx2l1dQ6YqXuUbtNp~Eqg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Extended_Batch_Sessions_and_Three_Phase_Debugging","translated_slug":"","page_count":8,"language":"en","content_type":"Work","summary":"Batch environments are notoriously 
unfriendly because it's not easy to interactively diagnose the health of a job. A job may be terminated without warning when it reaches the end of an allotted runtime slot, or it may terminate even sooner due to an unsuspected bug that occurs only at large scale. Two strategies are proposed that take advantage of DMT-CP (Distributed MultiThreaded CheckPointing) for systemlevel checkpointing. First, we describe a three-phase debugging strategy that permits one to interactively debug long-running MPI applications that were developed for noninteractive batch environments. Second, we review how to use the SLURM resource manager capability to easily implement extended batch sessions that overcome the typical limitation of 24 hours maximum for a single batch job on large HPC resources. We argue for greater use of this lesser known capability, as a means to remove the necessity for the application-specific checkpointing found in many longrunning jobs. CCS Concepts •Software and its engineering → Checkpoint / restart; Software testing and debugging;","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":95496120,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496120/thumbnails/1.jpg","file_name":"2949550.pdf","download_url":"https://www.academia.edu/attachments/95496120/download_file","bulk_download_file_name":"Extended_Batch_Sessions_and_Three_Phase.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496120/2949550-libre.pdf?1670611737=\u0026response-content-disposition=attachment%3B+filename%3DExtended_Batch_Sessions_and_Three_Phase.pdf\u0026Expires=1743681116\u0026Signature=ANXxvFlML1ANNEFpS41RUFcumBmLgW46vb5HJ9By9LIVXYlJldh1DgHJbP7fQTRF017VJQMqj9k6sS4ULCnWsfdc-bRlbVtAc1D7toFJylrjSDvEYHSsCw~9DmoXsVoLRVp5eOnfgujaMTPmGlgWKbQdIAzBmPWGsWRIpdFFJxosyn5OnGG2D7Ih-LcT42BTSQo2IkimzJTZJ-peDmKVZYlTmplHAwADM0Gk7Zm3TsFyVbiaYcxBXt5dK~fKMVI7KksjJqKIFI30f62hvulZqRLpHU19DtN3EpFm2Ju5D~-jd1dY0DDa10jq0lyO2LlOMYx2l1dQ6YqXuUbtNp~Eqg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":95496121,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496121/thumbnails/1.jpg","file_name":"2949550.pdf","download_url":"https://www.academia.edu/attachments/95496121/download_file","bulk_download_file_name":"Extended_Batch_Sessions_and_Three_Phase.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496121/2949550-libre.pdf?1670611736=\u0026response-content-disposition=attachment%3B+filename%3DExtended_Batch_Sessions_and_Three_Phase.pdf\u0026Expires=1743681116\u0026Signature=FlP7u4U~86SbzDGFHk~aRrus9XBMVgiIUBLgXobmm1P6Vyh6MVjLZrV7xnuWd~honimG-nosJ1AYMhsX8kmBU8QG-6oFAaA3jt1D-jv1q9fspKXVeTFR5dU8Njmi8PW8yWEHaYS1spzbadTsYA~0Mawk6Em-NM7BCzjQeNh~M7lymqcQfFr3ZuMtC1OT7Sq-Gf18flzgFctWSNiVffMAnAJnFR9sTGLkmkBJFR2SJ98ixFL2MLhJcD7JyAS~rWJ~y5FskvR5XtpGnmBASUkZGFJg7VKGfHU8mbjQpR9LeDwKvzhyvpgyWElxM-fsdJ-DhJulQJkQnpc4-CMDkj8ChQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer 
Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":440,"name":"Distributed Computing","url":"https://www.academia.edu/Documents/in/Distributed_Computing"},{"id":568451,"name":"Batch Processing","url":"https://www.academia.edu/Documents/in/Batch_Processing"},{"id":879152,"name":"Debugging","url":"https://www.academia.edu/Documents/in/Debugging"}],"urls":[{"id":26834961,"url":"https://dl.acm.org/doi/pdf/10.1145/2949550.2949645"}]},
        dispatcherData: dispatcherData
      });
      // Mark the strip so a later pass over the same element does not build a second view.
      $(this).data('initialized', true);
    }
  });
  // Explicit terminator: without it this call and the following `if` run together on one line.
  $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip");
  // The condition is a literal in the page as served; this strip has no figure carousel to set up.
  if (false) {
    Aedu.setUpFigureCarousel('profile-work-92504210-figures');
  }
});
</script>
<div class="js-work-strip profile--work_container" data-work-id="92504179"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/92504179/Memory_harvesting_VMs_in_cloud_platforms"><img alt="Research paper thumbnail of Memory-harvesting VMs in cloud platforms" class="work-thumbnail" src="https://attachments.academia-assets.com/95496086/thumbnails/1.jpg"></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/92504179/Memory_harvesting_VMs_in_cloud_platforms">Memory-harvesting VMs in cloud platforms</a></div><div class="wp-workCard_item"><span>Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems</span><span>, 2022</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Cloud platforms monetize their spare capacity by renting &quot;Spot&quot; virtual machines (VMs) that can b...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" 
data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Cloud platforms monetize their spare capacity by renting &quot;Spot&quot; virtual machines (VMs) that can be evicted in favor of higher-priority VMs. Recent work has shown that resource-harvesting VMs are more effective at exploiting spare capacity than Spot VMs, while also reducing the number of evictions. However, the prior work focused on harvesting CPU cores while keeping memory size fixed. This wastes a substantial monetization opportunity and may even limit the ability of harvesting VMs to leverage spare cores. Thus, in this paper, we explore memory harvesting and its challenges in real cloud platforms, namely its impact on VM creation time, NUMA spanning, and page fragmentation. We start by characterizing the amount and dynamics of the spare memory in Azure. We then design and implement memory-harvesting VMs (MHVMs), introducing new techniques for memory buffering, batching, and pre-reclamation. To demonstrate the use of MHVMs, we also extend a popular cluster scheduling framework (Hadoop) and a FaaS platform to adapt to them. Our main results show that (1) there is plenty of scope for memory harvesting in real platforms; (2) MHVMs are effective at mitigating the negative impacts of harvesting; and (3) our extensions of Hadoop and FaaS successfully hide the MHVMs&#39; varying memory size from the users&#39; data-processing jobs and functions. We conclude that memory harvesting has great potential for practical deployment and users can save up to 93% of their costs when running workloads on MHVMs. 
CCS CONCEPTS • Computer systems organization → Cloud computing.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="d7d751c6739209985055ccf14b465ef4" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:95496086,&quot;asset_id&quot;:92504179,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/95496086/download_file?s=profile"><span><i class="fa fa-arrow-down" aria-hidden="true"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="92504179"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil" aria-hidden="true"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="92504179"><i class="fa fa-spinner fa-spin" aria-hidden="true"></i></span><script>
// On DOM ready, queue a view-count fetch for this work and write the result into its badge.
$(function () {
  var workId = 92504179;
  window.Academia.workViewCountsFetcher.queue(workId, function (count) {
    var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View');
    // One lookup, quoted attribute value; the text replaces the spinner icon inside the badge.
    $('.js-view-count[data-work-id="' + workId + '"]')
      .text(description)
      .attr('title', description)
      .tooltip();
  });
});
</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>
// On DOM ready, queue a percentile fetch; the callback capitalizes the text and reveals the widget.
$(function () {
  var workId = 92504179;
  window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) {
    var container = $(".js-work-strip[data-work-id='92504179']");
    container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1));
    container.find('.percentile-widget').show(); 
container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><!-- NOTE(review): the id "work-strip-premium-row-container" is emitted for every work strip on this page; ids should be unique per document. Confirm nothing looks it up before renaming. --><div id="work-strip-premium-row-container"></div></div></div><script>
// Creates the WorkStripView for work 92504179 once the two profile bundles have been required.
require.config({ waitSeconds: 90 })([
  "https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js",
  "https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"
], function() {
  // from javascript_helper.rb
  // (kept on its own line: as a trailing line comment it would swallow the code that follows)
  var dispatcherData = {};
  // The condition is a literal in the page as served; kept unchanged.
  if (true) {
    window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events);
    dispatcherData = {
      dispatcher: window.WowProfile.dispatcher,
      downloadLinkId: "d7d751c6739209985055ccf14b465ef4"
    };
  }
  // Attribute value is quoted: a bare number is not a valid CSS identifier.
  $('.js-work-strip[data-work-id="92504179"]').each(function() {
    // Skip strips that already carry the 'initialized' data flag.
    if (!$(this).data('initialized')) {
      new WowProfile.WorkStripView({
        el: this,
        workJSON: {"id":92504179,"title":"Memory-harvesting VMs in cloud platforms","translated_title":"","metadata":{"publisher":"ACM","grobid_abstract":"Cloud platforms monetize their spare capacity by renting \"Spot\" virtual machines (VMs) that can be evicted in favor of higher-priority VMs. Recent work has shown that resource-harvesting VMs are more effective at exploiting spare capacity than Spot VMs, while also reducing the number of evictions. However, the prior work focused on harvesting CPU cores while keeping memory size fixed. This wastes a substantial monetization opportunity and may even limit the ability of harvesting VMs to leverage spare cores. Thus, in this paper, we explore memory harvesting and its challenges in real cloud platforms, namely its impact on VM creation time, NUMA spanning, and page fragmentation. We start by characterizing the amount and dynamics of the spare memory in Azure. We then design and implement memory-harvesting VMs (MHVMs), introducing new techniques for memory buffering, batching, and pre-reclamation. 
To demonstrate the use of MHVMs, we also extend a popular cluster scheduling framework (Hadoop) and a FaaS platform to adapt to them. Our main results show that (1) there is plenty of scope for memory harvesting in real platforms; (2) MHVMs are effective at mitigating the negative impacts of harvesting; and (3) our extensions of Hadoop and FaaS successfully hide the MHVMs' varying memory size from the users' data-processing jobs and functions. We conclude that memory harvesting has great potential for practical deployment and users can save up to 93% of their costs when running workloads on MHVMs. CCS CONCEPTS • Computer systems organization → Cloud computing.","publication_date":{"day":null,"month":null,"year":2022,"errors":{}},"publication_name":"Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","grobid_abstract_attachment_id":95496085},"translated_abstract":null,"internal_url":"https://www.academia.edu/92504179/Memory_harvesting_VMs_in_cloud_platforms","translated_internal_url":"","created_at":"2022-12-09T09:53:37.035-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":95496086,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496086/thumbnails/1.jpg","file_name":"3503222.pdf","download_url":"https://www.academia.edu/attachments/95496086/download_file","bulk_download_file_name":"Memory_harvesting_VMs_in_cloud_platforms.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496086/3503222-libre.pdf?1670611753=\u0026response-content-disposition=attachment%3B+filename%3DMemory_harvesting_VMs_in_cloud_platforms.pdf\u0026Expires=1743681117\u0026Signature=aFmj6vgPJHPA1hj0~AEUFE8U-jmR5qfiqp-TzEFgcBK0mk~YKGgntc9H6YzLtsCbHK5R~TVch-NSKIHX2Jg6pz3JI~xP9f-qntSvaSs77LiYiI4khMlCH99Pv1gSSSKdatPBt6h
JSLWiPrEVypbrKPTOmI8QEKpBvYTCX0vn2hhNepRcWBA6WhOPwgmvAlXL5c3RqaVOwXdNkdmJnyRTU0WkJjtKkPRxFHe37StbnKV3LtBDH1~bV9U7meTjlnDWeSF-~aNBEQw79cndjKCHsnQ6NfKSE47OmJ-n3XusLxIVCxIGXRwU6w0pJ8rj94PcsQn-dCuYgJt49Cmf1ONmWg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Memory_harvesting_VMs_in_cloud_platforms","translated_slug":"","page_count":12,"language":"en","content_type":"Work","summary":"Cloud platforms monetize their spare capacity by renting \"Spot\" virtual machines (VMs) that can be evicted in favor of higher-priority VMs. Recent work has shown that resource-harvesting VMs are more effective at exploiting spare capacity than Spot VMs, while also reducing the number of evictions. However, the prior work focused on harvesting CPU cores while keeping memory size fixed. This wastes a substantial monetization opportunity and may even limit the ability of harvesting VMs to leverage spare cores. Thus, in this paper, we explore memory harvesting and its challenges in real cloud platforms, namely its impact on VM creation time, NUMA spanning, and page fragmentation. We start by characterizing the amount and dynamics of the spare memory in Azure. We then design and implement memory-harvesting VMs (MHVMs), introducing new techniques for memory buffering, batching, and pre-reclamation. To demonstrate the use of MHVMs, we also extend a popular cluster scheduling framework (Hadoop) and a FaaS platform to adapt to them. Our main results show that (1) there is plenty of scope for memory harvesting in real platforms; (2) MHVMs are effective at mitigating the negative impacts of harvesting; and (3) our extensions of Hadoop and FaaS successfully hide the MHVMs' varying memory size from the users' data-processing jobs and functions. We conclude that memory harvesting has great potential for practical deployment and users can save up to 93% of their costs when running workloads on MHVMs. 
CCS CONCEPTS • Computer systems organization → Cloud computing.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":95496086,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496086/thumbnails/1.jpg","file_name":"3503222.pdf","download_url":"https://www.academia.edu/attachments/95496086/download_file","bulk_download_file_name":"Memory_harvesting_VMs_in_cloud_platforms.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496086/3503222-libre.pdf?1670611753=\u0026response-content-disposition=attachment%3B+filename%3DMemory_harvesting_VMs_in_cloud_platforms.pdf\u0026Expires=1743681117\u0026Signature=aFmj6vgPJHPA1hj0~AEUFE8U-jmR5qfiqp-TzEFgcBK0mk~YKGgntc9H6YzLtsCbHK5R~TVch-NSKIHX2Jg6pz3JI~xP9f-qntSvaSs77LiYiI4khMlCH99Pv1gSSSKdatPBt6hJSLWiPrEVypbrKPTOmI8QEKpBvYTCX0vn2hhNepRcWBA6WhOPwgmvAlXL5c3RqaVOwXdNkdmJnyRTU0WkJjtKkPRxFHe37StbnKV3LtBDH1~bV9U7meTjlnDWeSF-~aNBEQw79cndjKCHsnQ6NfKSE47OmJ-n3XusLxIVCxIGXRwU6w0pJ8rj94PcsQn-dCuYgJt49Cmf1ONmWg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":95496085,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/95496085/thumbnails/1.jpg","file_name":"3503222.pdf","download_url":"https://www.academia.edu/attachments/95496085/download_file","bulk_download_file_name":"Memory_harvesting_VMs_in_cloud_platforms.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/95496085/3503222-libre.pdf?1670611765=\u0026response-content-disposition=attachment%3B+filename%3DMemory_harvesting_VMs_in_cloud_platforms.pdf\u0026Expires=1743681117\u0026Signature=A5rUIm0pn2fxsi7W4oqIFh67BiI8qo6SkF097hRQyBfXSxhq0j1Q4-Wh4RDsLlB2-KMi6tZL9lNEDfzaOqqcqVe7hORy2nP1g8n3CwWQ6atqN1egrln2syalz7t2i9GD4KqCxSSPlYuObLZ1PNpkdWs1eajdVfsuT~36SKoKZ
yT002h5tKEFNXGjgse~zxiqRdA2dyi4pES38VTlzvXUn85qUxTIrvBdrC2gR0rDXh-STx4EsDLX6sLKt2QJq-emaIhTX8EXidpB9PpWcHVh0-Q7wRXO6zIZ77a1XqqEI6ZaFeWOa7vmjexGh~K2V2o9IWuErENSLteC~IRDJXYk8Q__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":26860,"name":"Cloud Computing","url":"https://www.academia.edu/Documents/in/Cloud_Computing"}],"urls":[{"id":26834941,"url":"https://dl.acm.org/doi/pdf/10.1145/3503222.3507725"}]},
        dispatcherData: dispatcherData
      });
      // Mark the strip so a later pass over the same element does not build a second view.
      $(this).data('initialized', true);
    }
  });
  // Explicit terminator: without it this call and the following `if` run together on one line.
  $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip");
  // The condition is a literal in the page as served; this strip has no figure carousel to set up.
  if (false) {
    Aedu.setUpFigureCarousel('profile-work-92504179-figures');
  }
});
</script>
<div class="js-work-strip profile--work_container" data-work-id="82956344"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/82956344/2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CONF_SCIPY_2013_DMTCP_Bringing_Checkpoint_Restart_to_Python"><img alt="Research paper thumbnail of 2 PROC. OF THE 12th PYTHON IN SCIENCE CONF. (SCIPY 2013) DMTCP: Bringing Checkpoint-Restart to Python" class="work-thumbnail" src="https://attachments.academia-assets.com/88480680/thumbnails/1.jpg"></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/82956344/2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CONF_SCIPY_2013_DMTCP_Bringing_Checkpoint_Restart_to_Python">2 PROC. OF THE 12th PYTHON IN SCIENCE CONF. (SCIPY 2013) DMTCP: Bringing Checkpoint-Restart to Python</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. 
...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. It operates in user-space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython &amp;quot;from the outside &amp;quot; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be re-quested interactively within a Python session, or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process), and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using VNC; (ii) a Fast/Slow technique to use multiple hosts or cores to check one Cython computation in parallel; and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint f...</span></div><div class="wp-workCard_item"><div class="carousel-container carousel-container--sm" id="profile-work-82956344-figures"><div class="prev-slide-container js-prev-button-container"><button aria-label="Previous" class="carousel-navigation-button js-profile-work-82956344-figures-prev"><span class="material-symbols-outlined" style="font-size: 24px" translate="no">arrow_back_ios</span></button></div><div class="slides-container js-slides-container"><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455763/figure-5-to-checkpoint-image-file-each-process-has-its-own"><img alt="to a checkpoint image file. Each process has its own checkpoint image. 
Prior to checkpoint, each plugin will have copied into user-space memory any kernel state associated with its concerns. Examples of such concerns include network sockets, files, and pseudo-terminals. Once the checkpoint image has been created, the checkpoint thread &quot;un-quiesces&quot; the user threads and they resume executing application code. " class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_005.jpg" /></a></figure><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455733/figure-1-fast-cython-with-slow-cpython-checking-nodes"><img alt="Fig. 1: Fast Cython with Slow CPython &quot;checking&quot; nodes. " class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_001.jpg" /></a></figure><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455743/figure-2-to-execute-the-undo-command-the-debugging-session"><img alt="To execute the UNDO command, the debugging session is restarted from the checkpoint image, and the debugging com- mands are automatically re-executed from the list excluding the last command. This takes the process back to before the debugger command was issued. " class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_002.jpg" /></a></figure><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455749/figure-4-reverse-expression-watchpoint-command-into-series"><img alt="Fig. 4: Reverse Expression Watchpoint. command into a series of commands terminating with step is non-trivial, and an algorithm for that decomposition is presented in [Visan1l1] . " class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_003.jpg" /></a></figure><figure class="figure-slide-container"><a href="https://www.academia.edu/figures/40455758/figure-5-architecture-of-dmtcp"><img alt="Fig. 5: Architecture of DMTCP. 
" class="figure-slide-image" src="https://figures.academia-assets.com/88480680/figure_004.jpg" /></a></figure></div><div class="next-slide-container js-next-button-container"><button aria-label="Next" class="carousel-navigation-button js-profile-work-82956344-figures-next"><span class="material-symbols-outlined" style="font-size: 24px" translate="no">arrow_forward_ios</span></button></div></div></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="d98fb89f03b46ab4e5d41dbf21d5985a" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:88480680,&quot;asset_id&quot;:82956344,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/88480680/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="82956344"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="82956344"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 82956344; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=82956344]").text(description); $(".js-view-count[data-work-id=82956344]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var 
workId = 82956344;
  window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) {
    var container = $(".js-work-strip[data-work-id='82956344']");
    container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1));
    container.find('.percentile-widget').show();
    container.find('.percentile-widget').removeClass('hidden');
  });
});</script></span></div><!-- NOTE(review): the id "work-strip-premium-row-container" is emitted for every work strip on this page; ids should be unique per document. Confirm nothing looks it up before renaming. --><div id="work-strip-premium-row-container"></div></div></div><script>
// Creates the WorkStripView for work 82956344 once the two profile bundles have been required.
require.config({ waitSeconds: 90 })([
  "https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js",
  "https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"
], function() {
  // from javascript_helper.rb
  // (kept on its own line: as a trailing line comment it would swallow the code that follows)
  var dispatcherData = {};
  // The condition is a literal in the page as served; kept unchanged.
  if (true) {
    window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events);
    dispatcherData = {
      dispatcher: window.WowProfile.dispatcher,
      downloadLinkId: "d98fb89f03b46ab4e5d41dbf21d5985a"
    };
  }
  // Attribute value is quoted: a bare number is not a valid CSS identifier.
  $('.js-work-strip[data-work-id="82956344"]').each(function() {
    // Skip strips that already carry the 'initialized' data flag.
    if (!$(this).data('initialized')) {
      new WowProfile.WorkStripView({
        el: this,
        workJSON: {"id":82956344,"title":"2 PROC. OF THE 12th PYTHON IN SCIENCE CONF. (SCIPY 2013) DMTCP: Bringing Checkpoint-Restart to Python","translated_title":"","metadata":{"abstract":"Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. It operates in user-space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026quot;from the outside \u0026quot; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be re-quested interactively within a Python session, or under the control of a specific Python program. 
Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process), and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using VNC; (ii) a Fast/Slow technique to use multiple hosts or cores to check one Cython computation in parallel; and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint f...","publication_date":{"day":null,"month":null,"year":2016,"errors":{}}},"translated_abstract":"Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. It operates in user-space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026quot;from the outside \u0026quot; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be re-quested interactively within a Python session, or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process), and upon restarting (from a checkpoint image). 
Applications of DMTCP are demonstrated for: (i) Python-based graphics using VNC; (ii) a Fast/Slow technique to use multiple hosts or cores to check one Cython computation in parallel; and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint f...","internal_url":"https://www.academia.edu/82956344/2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CONF_SCIPY_2013_DMTCP_Bringing_Checkpoint_Restart_to_Python","translated_internal_url":"","created_at":"2022-07-11T08:22:33.099-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":88480680,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/88480680/thumbnails/1.jpg","file_name":"arya.pdf","download_url":"https://www.academia.edu/attachments/88480680/download_file","bulk_download_file_name":"2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CON.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/88480680/arya-libre.pdf?1657584648=\u0026response-content-disposition=attachment%3B+filename%3D2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CON.pdf\u0026Expires=1743681117\u0026Signature=gMJ-thrxMQVjbyyVpIkLnhQJ8T2BdKDFqQUctIqQAvlq8AGua99mpDN12K9aZG4rNIon340pWFXF9IbXNl2v6j20oW1vbQ4McHPSdCXLGq8j39KgtZrDFWyMdPJUKTyxS2KNpcHN8JMIOPfugPyZe5TPDFM9smKaUvtvEDOMrkxU2R9JsWZbBbu46YmeiDVbYpRIh~hF8b976cVhmlcg9u4-59Yylk2igSG~36wSOxu5w5FrY40TNPrIJ4CXa5KXanMik4vD44y3jRqlwQqO~Yor010MRoB0AlgV-0S6RlVLQMXGZrhwkLZvmAh7TGWz9DW4GLtOU6Ypzk8KaUWfrA__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CONF_SCIPY_2013_DMTCP_Bringing_Checkpoint_Restart_to_Python","translated_slug":"","page_count":6,"language":"en","content_type":"Work","summary":"Abstract—DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint-restart package. 
It operates in user-space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026quot;from the outside \u0026quot; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be re-quested interactively within a Python session, or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process), and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using VNC; (ii) a Fast/Slow technique to use multiple hosts or cores to check one Cython computation in parallel; and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint f...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":88480680,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/88480680/thumbnails/1.jpg","file_name":"arya.pdf","download_url":"https://www.academia.edu/attachments/88480680/download_file","bulk_download_file_name":"2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CON.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/88480680/arya-libre.pdf?1657584648=\u0026response-content-disposition=attachment%3B+filename%3D2_PROC_OF_THE_12th_PYTHON_IN_SCIENCE_CON.pdf\u0026Expires=1743681117\u0026Signature=gMJ-thrxMQVjbyyVpIkLnhQJ8T2BdKDFqQUctIqQAvlq8AGua99mpDN12K9aZG4rNIon340pWFXF9IbXNl2v6j20oW1vbQ4McHPSdCXLGq8j39KgtZrDFWyMdPJUKTyxS2KNpcHN8JMIOPfugPyZe5TPDFM9smKaUvtvEDOMrkxU2R9JsWZbBbu46YmeiDVbYpRIh~hF8b976cVhmlcg9u4-59Yylk2igSG~36wSOxu5w5FrY40TNPrIJ4CXa5KXanMik4vD44y3jRqlwQqO~Yor010MRoB0AlgV-0S6RlVLQMXGZrhwkLZvmAh7TGWz9DW4GLtOU6Ypzk8KaUWfrA__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":1145043,"name":"Ipython","url":"https://www.academia.edu/Documents/in/Ipython"}],"urls":[{"id":22083991,"url":"http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.897.6773\u0026rep=rep1\u0026type=pdf"}]},
        dispatcherData: dispatcherData
      });
      // Mark the strip so a later pass over the same element does not build a second view.
      $(this).data('initialized', true);
    }
  });
  // Explicit terminator: without it this call and the following `if` run together on one line.
  $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip");
  // The condition is a literal in the page as served; this strip does set up its figure carousel.
  if (true) {
    Aedu.setUpFigureCarousel('profile-work-82956344-figures');
  }
});
</script>
<div class="js-work-strip profile--work_container" data-work-id="78073660"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/78073660/System_Level_Scalable_Checkpoint_Restart_for_Petascale_Computing"><img alt="Research paper thumbnail of System-Level Scalable Checkpoint-Restart for Petascale Computing" class="work-thumbnail" src="https://attachments.academia-assets.com/85247513/thumbnails/1.jpg" 
/></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/78073660/System_Level_Scalable_Checkpoint_Restart_for_Petascale_Computing">System-Level Scalable Checkpoint-Restart for Petascale Computing</a></div><div class="wp-workCard_item"><span>2016 IEEE 22nd International Conference on Parallel and Distributed Systems (ICPADS)</span><span>, 2016</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Fault tolerance for the upcoming exascale generation has long been an area of active research. On...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Fault tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that InfiniBand UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. 
Runtime overhead is reduced to less than 1%. The approach is also evaluated across three widely used MPI implementations.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="35d201cc48af1fa7bfe1fac603ba450c" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:85247513,&quot;asset_id&quot;:78073660,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/85247513/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="78073660"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="78073660"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 78073660; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=78073660]").text(description); $(".js-view-count[data-work-id=78073660]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 78073660; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='78073660']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); 
container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "35d201cc48af1fa7bfe1fac603ba450c" } } $('.js-work-strip[data-work-id=78073660]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":78073660,"title":"System-Level Scalable Checkpoint-Restart for Petascale Computing","translated_title":"","metadata":{"publisher":"IEEE","grobid_abstract":"Fault tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that InfiniBand UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. 
In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. The approach is also evaluated across three widely used MPI implementations.","publication_date":{"day":null,"month":null,"year":2016,"errors":{}},"publication_name":"2016 IEEE 22nd International Conference on Parallel and Distributed Systems (ICPADS)","grobid_abstract_attachment_id":85247513},"translated_abstract":null,"internal_url":"https://www.academia.edu/78073660/System_Level_Scalable_Checkpoint_Restart_for_Petascale_Computing","translated_internal_url":"","created_at":"2022-04-30T14:18:38.786-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":85247513,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247513/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/85247513/download_file","bulk_download_file_name":"System_Level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247513/1607-libre.pdf?1651354198=\u0026response-content-disposition=attachment%3B+filename%3DSystem_Level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681117\u0026Signature=WD91O8CN-hjkpgvAYOAycMpMKE0p5YhKu0vm53rd~LSMpECPWI~PUvgOEinCm49UXr57ws6ndeCvM~wzwR2IOPuNwYkIBhde6ZhvR9ApG-3U3sXipExrSF-~dbHEWyhnp7CIk6HwJE2mB4U-fdg9GowtVbnRObcFU~k3i37twtQLgarl2y4MEzGPcx9Y1xnVutgnCWnAEFkU7oaa87li72LZjCSh7G1ORe14PsLZlOQqHuHDOj73gIDaQrfbhHhwrR6wn9P~ubu-bJIOyasYIT2QfXV1ycGnc6iv1~w97jp3CCnXLFq0M9bFeDuOw9RRn19-oEHWjnBtKTwdtsliZg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"System_Level_Scalable_Checkpoint_Restart_for_Petascale_Computing","translated_slug":"","page_count":18,"language":"en","content_type":"Work","summary":"Fault 
tolerance for the upcoming exascale generation has long been an area of active research. One of the components of a fault tolerance strategy is checkpointing. Petascale-level checkpointing is demonstrated through a new mechanism for virtualization of the InfiniBand UD (unreliable datagram) mode, and for updating the remote address on each UD-based send, due to lack of a fixed peer. Note that InfiniBand UD is required to support modern MPI implementations. An extrapolation from the current results to future SSD-based storage systems provides evidence that the current approach will remain practical in the exascale generation. This transparent checkpointing approach is evaluated using a framework of the DMTCP checkpointing package. Results are shown for HPCG (linear algebra), NAMD (molecular dynamics), and the NAS NPB benchmarks. In tests up to 32,752 MPI processes on 32,752 CPU cores, checkpointing of a computation with a 38 TB memory footprint in 11 minutes is demonstrated. Runtime overhead is reduced to less than 1%. 
The approach is also evaluated across three widely used MPI implementations.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":85247513,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247513/thumbnails/1.jpg","file_name":"1607.pdf","download_url":"https://www.academia.edu/attachments/85247513/download_file","bulk_download_file_name":"System_Level_Scalable_Checkpoint_Restart.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247513/1607-libre.pdf?1651354198=\u0026response-content-disposition=attachment%3B+filename%3DSystem_Level_Scalable_Checkpoint_Restart.pdf\u0026Expires=1743681117\u0026Signature=WD91O8CN-hjkpgvAYOAycMpMKE0p5YhKu0vm53rd~LSMpECPWI~PUvgOEinCm49UXr57ws6ndeCvM~wzwR2IOPuNwYkIBhde6ZhvR9ApG-3U3sXipExrSF-~dbHEWyhnp7CIk6HwJE2mB4U-fdg9GowtVbnRObcFU~k3i37twtQLgarl2y4MEzGPcx9Y1xnVutgnCWnAEFkU7oaa87li72LZjCSh7G1ORe14PsLZlOQqHuHDOj73gIDaQrfbhHhwrR6wn9P~ubu-bJIOyasYIT2QfXV1ycGnc6iv1~w97jp3CCnXLFq0M9bFeDuOw9RRn19-oEHWjnBtKTwdtsliZg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":111436,"name":"IEEE","url":"https://www.academia.edu/Documents/in/IEEE"}],"urls":[{"id":20075549,"url":"http://xplorestaging.ieee.org/ielx7/7822825/7823715/07823840.pdf?arnumber=7823840"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-78073660-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="78073659"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" 
data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/78073659/Transition_Watchpoints_Teaching_Old_Debuggers_New_Tricks"><img alt="Research paper thumbnail of Transition Watchpoints: Teaching Old Debuggers New Tricks" class="work-thumbnail" src="https://attachments.academia-assets.com/85247512/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/78073659/Transition_Watchpoints_Teaching_Old_Debuggers_New_Tricks">Transition Watchpoints: Teaching Old Debuggers New Tricks</a></div><div class="wp-workCard_item"><span>The Art, Science, and Engineering of Programming</span><span>, 2017</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Reversible debuggers and process replay have been developed at least since. This vision enables o...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Reversible debuggers and process replay have been developed at least since. This vision enables one to execute backwards in time under a debugger. Two important problems in practice are that, first, current reversible debuggers are slow when reversing over long time periods, and, second, after building one reversible debugger, it is difficult to transfer that achievement to a new programming environment. The user observes a bug when arriving at an error. Searching backwards for the corresponding fault may require many reverse steps. Ultimately, the user prefers to write an expression that will transition to false upon arriving at the fault. 
The solution is an expression-transition watchpoint facility based on top of snapshots and record/replay. Expression-transition watchpoints are implemented as binary search through the timeline of a program execution, while using the snapshots as landmarks within that timeline. This allows for debugging of subtle bugs that appear only after minutes or more of program execution. When a bug occurs within seconds of program startup, repeated debugging sessions suffice. Reversible debugging is preferred for bugs seen only after minutes. This architecture allows for an efficient and easy-to-write snapshot-based reversible debugger on top of a conventional debugger. The validity of this approach was tested by developing four personalities (for GDB, MATLAB, Perl, and Python), with each personality typically requiring just lines of code.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="152f7f157eddb106f82970ed95fdd4f1" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:85247512,&quot;asset_id&quot;:78073659,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/85247512/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="78073659"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="78073659"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 78073659; 
window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=78073659]").text(description); $(".js-view-count[data-work-id=78073659]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 78073659; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='78073659']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "152f7f157eddb106f82970ed95fdd4f1" } } $('.js-work-strip[data-work-id=78073659]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":78073659,"title":"Transition Watchpoints: Teaching Old Debuggers New Tricks","translated_title":"","metadata":{"publisher":"Aspect-Oriented Software Association (AOSA)","ai_title_tag":"Efficient Reversible Debugging with Expression-Transition Watchpoints","grobid_abstract":"Reversible debuggers and process replay have been developed at least since. 
This vision enables one to execute backwards in time under a debugger. Two important problems in practice are that, first, current reversible debuggers are slow when reversing over long time periods, and, second, after building one reversible debugger, it is difficult to transfer that achievement to a new programming environment. The user observes a bug when arriving at an error. Searching backwards for the corresponding fault may require many reverse steps. Ultimately, the user prefers to write an expression that will transition to false upon arriving at the fault. The solution is an expression-transition watchpoint facility based on top of snapshots and record/replay. Expression-transition watchpoints are implemented as binary search through the timeline of a program execution, while using the snapshots as landmarks within that timeline. This allows for debugging of subtle bugs that appear only after minutes or more of program execution. When a bug occurs within seconds of program startup, repeated debugging sessions suffice. Reversible debugging is preferred for bugs seen only after minutes. This architecture allows for an efficient and easy-to-write snapshot-based reversible debugger on top of a conventional debugger. 
The validity of this approach was tested by developing four personalities (for GDB, MATLAB, Perl, and Python), with each personality typically requiring just lines of code.","publication_date":{"day":null,"month":null,"year":2017,"errors":{}},"publication_name":"The Art, Science, and Engineering of Programming","grobid_abstract_attachment_id":85247512},"translated_abstract":null,"internal_url":"https://www.academia.edu/78073659/Transition_Watchpoints_Teaching_Old_Debuggers_New_Tricks","translated_internal_url":"","created_at":"2022-04-30T14:18:38.628-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":85247512,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247512/thumbnails/1.jpg","file_name":"1703.pdf","download_url":"https://www.academia.edu/attachments/85247512/download_file","bulk_download_file_name":"Transition_Watchpoints_Teaching_Old_Debu.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247512/1703-libre.pdf?1651354205=\u0026response-content-disposition=attachment%3B+filename%3DTransition_Watchpoints_Teaching_Old_Debu.pdf\u0026Expires=1743681117\u0026Signature=G4n3fIejtEjJigm90O2TlD0lHllfQQrl~z41r8ckokiyRQvqehYiacpKPK9HNXmLXN~DkfeVWyDMFIKpLD34fNCkB4MdQsvOhknnYah0~~Ui--~i8hjla36Xs4YYDX5WWSSQKNzQQ4CFgzFqbW09sPApvWqhHQ-5cHRaJGGH3twJIjO7ZoEekBsGFbhDvZf8Thn7Y~v-nW54VXjeKxz075XEY~ODl19XKhOaFDbl82K~reU4QIfSszkGyUhhVf2pNcgk6xcaLXyiLsA-R1utx2wBwCPwWsmtL1VoJNg1qQYBvLJR4aujGjV9mq4~d7vLecRyKnu9ZN36-vrvJaH2bQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Transition_Watchpoints_Teaching_Old_Debuggers_New_Tricks","translated_slug":"","page_count":28,"language":"en","content_type":"Work","summary":"Reversible debuggers and process replay have been developed at least since. This vision enables one to execute backwards in time under a debugger. 
Two important problems in practice are that, first, current reversible debuggers are slow when reversing over long time periods, and, second, after building one reversible debugger, it is difficult to transfer that achievement to a new programming environment. The user observes a bug when arriving at an error. Searching backwards for the corresponding fault may require many reverse steps. Ultimately, the user prefers to write an expression that will transition to false upon arriving at the fault. The solution is an expression-transition watchpoint facility based on top of snapshots and record/replay. Expression-transition watchpoints are implemented as binary search through the timeline of a program execution, while using the snapshots as landmarks within that timeline. This allows for debugging of subtle bugs that appear only after minutes or more of program execution. When a bug occurs within seconds of program startup, repeated debugging sessions suffice. Reversible debugging is preferred for bugs seen only after minutes. This architecture allows for an efficient and easy-to-write snapshot-based reversible debugger on top of a conventional debugger. 
The validity of this approach was tested by developing four personalities (for GDB, MATLAB, Perl, and Python), with each personality typically requiring just lines of code.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":85247512,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247512/thumbnails/1.jpg","file_name":"1703.pdf","download_url":"https://www.academia.edu/attachments/85247512/download_file","bulk_download_file_name":"Transition_Watchpoints_Teaching_Old_Debu.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247512/1703-libre.pdf?1651354205=\u0026response-content-disposition=attachment%3B+filename%3DTransition_Watchpoints_Teaching_Old_Debu.pdf\u0026Expires=1743681117\u0026Signature=G4n3fIejtEjJigm90O2TlD0lHllfQQrl~z41r8ckokiyRQvqehYiacpKPK9HNXmLXN~DkfeVWyDMFIKpLD34fNCkB4MdQsvOhknnYah0~~Ui--~i8hjla36Xs4YYDX5WWSSQKNzQQ4CFgzFqbW09sPApvWqhHQ-5cHRaJGGH3twJIjO7ZoEekBsGFbhDvZf8Thn7Y~v-nW54VXjeKxz075XEY~ODl19XKhOaFDbl82K~reU4QIfSszkGyUhhVf2pNcgk6xcaLXyiLsA-R1utx2wBwCPwWsmtL1VoJNg1qQYBvLJR4aujGjV9mq4~d7vLecRyKnu9ZN36-vrvJaH2bQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-78073659-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="78073646"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" 
href="https://www.academia.edu/78073646/Towards_Fault_Tolerant_Energy_Efficient_High_Performance_Computing_in_the_Cloud"><img alt="Research paper thumbnail of Towards Fault-Tolerant Energy-Efficient High Performance Computing in the Cloud" class="work-thumbnail" src="https://attachments.academia-assets.com/85247494/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/78073646/Towards_Fault_Tolerant_Energy_Efficient_High_Performance_Computing_in_the_Cloud">Towards Fault-Tolerant Energy-Efficient High Performance Computing in the Cloud</a></div><div class="wp-workCard_item"><span>2012 IEEE International Conference on Cluster Computing</span><span>, 2012</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">In cluster computing, power and cooling represent a significant cost compared to the hardware its...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">In cluster computing, power and cooling represent a significant cost compared to the hardware itself. This is of special concern in the cloud, which provides access to large numbers of computers. We examine the use of ARM-based clusters for low-power, high performance computing. This work examines two likely use-modes: (i) a standard dedicated cluster; and (ii) a cluster of pre-configured virtual machines in the cloud. A 40-node department-level cluster based on an ARM Cortex-A9 is compared against a similar cluster based on an Intel Core 2 Duo, in contrast to a recent similar study on just a 4-node cluster. 
For the NAS benchmarks on 32-node clusters, ARM was found to have a power efficiency ranging from 1.3 to 6.2 times greater than that of Intel. This is despite Intel&#39;s approximately five times greater performance. The particular efficiency ratio depends primarily on the size of the working set relative to L2 cache. In addition to energy-efficient computing, this study also emphasizes fault tolerance: an important ingredient in high performance computing. It relies on two recent extensions to the DMTCP checkpoint-restart package. DMTCP was extended (i) to support ARM CPUs, and (ii) to support checkpointing of the Qemu virtual machine in user-mode. DMTCP is used both to checkpoint native distributed applications, and to checkpoint a network of virtual machines. This latter case demonstrates the ability to deploy pre-configured software in virtual machines hosted in the cloud, and further to migrate cluster computation between hosts in the cloud.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="44075ce150cb8622a8967578e423cc3d" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:85247494,&quot;asset_id&quot;:78073646,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/85247494/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="78073646"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" 
data-work-id="78073646"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 78073646; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=78073646]").text(description); $(".js-view-count[data-work-id=78073646]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 78073646; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='78073646']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "44075ce150cb8622a8967578e423cc3d" } } $('.js-work-strip[data-work-id=78073646]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":78073646,"title":"Towards Fault-Tolerant Energy-Efficient High Performance Computing in the Cloud","translated_title":"","metadata":{"grobid_abstract":"In cluster computing, power and cooling represent a significant cost compared to the 
hardware itself. This is of special concern in the cloud, which provides access to large numbers of computers. We examine the use of ARM-based clusters for low-power, high performance computing. This work examines two likely use-modes: (i) a standard dedicated cluster; and (ii) a cluster of pre-configured virtual machines in the cloud. A 40-node department-level cluster based on an ARM Cortex-A9 is compared against a similar cluster based on an Intel Core 2 Duo, in contrast to a recent similar study on just a 4-node cluster. For the NAS benchmarks on 32-node clusters, ARM was found to have a power efficiency ranging from 1.3 to 6.2 times greater than that of Intel. This is despite Intel's approximately five times greater performance. The particular efficiency ratio depends primarily on the size of the working set relative to L2 cache. In addition to energy-efficient computing, this study also emphasizes fault tolerance: an important ingredient in high performance computing. It relies on two recent extensions to the DMTCP checkpoint-restart package. DMTCP was extended (i) to support ARM CPUs, and (ii) to support checkpointing of the Qemu virtual machine in user-mode. DMTCP is used both to checkpoint native distributed applications, and to checkpoint a network of virtual machines. 
This latter case demonstrates the ability to deploy pre-configured software in virtual machines hosted in the cloud, and further to migrate cluster computation between hosts in the cloud.","publication_date":{"day":null,"month":null,"year":2012,"errors":{}},"publication_name":"2012 IEEE International Conference on Cluster Computing","grobid_abstract_attachment_id":85247494},"translated_abstract":null,"internal_url":"https://www.academia.edu/78073646/Towards_Fault_Tolerant_Energy_Efficient_High_Performance_Computing_in_the_Cloud","translated_internal_url":"","created_at":"2022-04-30T14:17:53.894-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":85247494,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247494/thumbnails/1.jpg","file_name":"cluster12b.pdf","download_url":"https://www.academia.edu/attachments/85247494/download_file","bulk_download_file_name":"Towards_Fault_Tolerant_Energy_Efficient.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247494/cluster12b-libre.pdf?1651354197=\u0026response-content-disposition=attachment%3B+filename%3DTowards_Fault_Tolerant_Energy_Efficient.pdf\u0026Expires=1743681117\u0026Signature=MRrn2b9CP9SXUXzUcnnF7Xj70WdT4cMdVERr6eILsNIH0YF0uelPMS2cK3BWUPucxd8apwfqWXPbBSNJVGinjeg6X6zVO4DRJpcRZ2ugCmokj2-xMTcxzQdolq2suQFOuss-uAotXLYD3IjxZFtWvYuejZRApwgQ4KFsYsEYfEgoL2Fp3RZO0JvugFJKgkir3cgwuBLgWGEZi6nOctHe7X3toUOydJSsP1A8gPogiXAhKH~W8lVzwEGmgbHBzdc00nWbsKVaVfvhYPt4gP-T8kF-fZmDU9ajI6k6DxrRHJKGJ1hQW-BJ5JFCY7aDjA1remlafB0cQIYnc8l~ens9MA__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Towards_Fault_Tolerant_Energy_Efficient_High_Performance_Computing_in_the_Cloud","translated_slug":"","page_count":5,"language":"en","content_type":"Work","summary":"In cluster computing, power and cooling represent a significant cost compared to 
the hardware itself. This is of special concern in the cloud, which provides access to large numbers of computers. We examine the use of ARM-based clusters for low-power, high performance computing. This work examines two likely use-modes: (i) a standard dedicated cluster; and (ii) a cluster of pre-configured virtual machines in the cloud. A 40-node department-level cluster based on an ARM Cortex-A9 is compared against a similar cluster based on an Intel Core 2 Duo, in contrast to a recent similar study on just a 4-node cluster. For the NAS benchmarks on 32node clusters, ARM was found to have a power efficiency ranging from 1.3 to 6.2 times greater than that of Intel. This is despite Intel's approximately five times greater performance. The particular efficiency ratio depends primarily on the size of the working set relative to L2 cache. In addition to energyefficient computing, this study also emphasizes fault tolerance: an important ingredient in high performance computing. It relies on two recent extensions to the DMTCP checkpointrestart package. DMTCP was extended (i) to support ARM CPUs, and (ii) to support checkpointing of the Qemu virtual machine in user-mode. DMTCP is used both to checkpoint native distributed applications, and to checkpoint a network of virtual machines. 
This latter case demonstrates the ability to deploy pre-configured software in virtual machines hosted in the cloud, and further to migrate cluster computation between hosts in the cloud.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":85247494,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/85247494/thumbnails/1.jpg","file_name":"cluster12b.pdf","download_url":"https://www.academia.edu/attachments/85247494/download_file","bulk_download_file_name":"Towards_Fault_Tolerant_Energy_Efficient.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/85247494/cluster12b-libre.pdf?1651354197=\u0026response-content-disposition=attachment%3B+filename%3DTowards_Fault_Tolerant_Energy_Efficient.pdf\u0026Expires=1743681117\u0026Signature=MRrn2b9CP9SXUXzUcnnF7Xj70WdT4cMdVERr6eILsNIH0YF0uelPMS2cK3BWUPucxd8apwfqWXPbBSNJVGinjeg6X6zVO4DRJpcRZ2ugCmokj2-xMTcxzQdolq2suQFOuss-uAotXLYD3IjxZFtWvYuejZRApwgQ4KFsYsEYfEgoL2Fp3RZO0JvugFJKgkir3cgwuBLgWGEZi6nOctHe7X3toUOydJSsP1A8gPogiXAhKH~W8lVzwEGmgbHBzdc00nWbsKVaVfvhYPt4gP-T8kF-fZmDU9ajI6k6DxrRHJKGJ1hQW-BJ5JFCY7aDjA1remlafB0cQIYnc8l~ens9MA__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":15835,"name":"Virtual Machines","url":"https://www.academia.edu/Documents/in/Virtual_Machines"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-78073646-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877693"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" 
href="https://www.academia.edu/70877693/FReD_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime"><img alt="Research paper thumbnail of FReD: Automated Debugging via Binary Search through a Process Lifetime" class="work-thumbnail" src="https://attachments.academia-assets.com/80443370/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/70877693/FReD_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime">FReD: Automated Debugging via Binary Search through a Process Lifetime</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Reversible debuggers have been developed at least since 1970. Such a feature is useful when the c...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Reversible debuggers have been developed at least since 1970. Such a feature is useful when the cause of a bug is close in time to the bug manifestation. When the cause is far back in time, one resorts to setting appropriate breakpoints in the debugger and beginning a new debugging session. For these cases when the cause of a bug is far in time from its manifestation, bug diagnosis requires a series of debugging sessions with which to narrow down the cause of the bug. For such &quot;difficult&quot; bugs, this work presents an automated tool to search through the process lifetime and locate the cause. As an example, the bug could be related to a program invariant failing. 
A binary search through the process lifetime suffices, since the invariant expression is true at the beginning of the program execution, and false when the bug is encountered. An algorithm for such a binary search is presented within the FReD (Fast Reversible Debugger) software. It is based on the ability to checkpo...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="2917e625a4e7940aa6bdff8ab186f63f" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443370,&quot;asset_id&quot;:70877693,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443370/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877693"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877693"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877693; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877693]").text(description); $(".js-view-count[data-work-id=70877693]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877693; window.Academia.workPercentilesFetcher.queue(workId, 
function (percentileText) { var container = $(".js-work-strip[data-work-id='70877693']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "2917e625a4e7940aa6bdff8ab186f63f" } } $('.js-work-strip[data-work-id=70877693]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877693,"title":"FReD: Automated Debugging via Binary Search through a Process Lifetime","translated_title":"","metadata":{"abstract":"Reversible debuggers have been developed at least since 1970. Such a feature is useful when the cause of a bug is close in time to the bug manifestation. When the cause is far back in time, one resorts to setting appropriate breakpoints in the debugger and beginning a new debugging session. For these cases when the cause of a bug is far in time from its manifestation, bug diagnosis requires a series of debugging sessions with which to narrow down the cause of the bug. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool to search through the process lifetime and locate the cause. As an example, the bug could be related to a program invariant failing. 
A binary search through the process lifetime suffices, since the invariant expression is true at the beginning of the program execution, and false when the bug is encountered. An algorithm for such a binary search is presented within the FReD (Fast Reversible Debugger) software. It is based on the ability to checkpo...","publication_date":{"day":20,"month":12,"year":2012,"errors":{}}},"translated_abstract":"Reversible debuggers have been developed at least since 1970. Such a feature is useful when the cause of a bug is close in time to the bug manifestation. When the cause is far back in time, one resorts to setting appropriate breakpoints in the debugger and beginning a new debugging session. For these cases when the cause of a bug is far in time from its manifestation, bug diagnosis requires a series of debugging sessions with which to narrow down the cause of the bug. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool to search through the process lifetime and locate the cause. As an example, the bug could be related to a program invariant failing. A binary search through the process lifetime suffices, since the invariant expression is true at the beginning of the program execution, and false when the bug is encountered. An algorithm for such a binary search is presented within the FReD (Fast Reversible Debugger) software. 
It is based on the ability to checkpo...","internal_url":"https://www.academia.edu/70877693/FReD_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime","translated_internal_url":"","created_at":"2022-02-07T15:23:25.825-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443370,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443370/thumbnails/1.jpg","file_name":"1212.5204.pdf","download_url":"https://www.academia.edu/attachments/80443370/download_file","bulk_download_file_name":"FReD_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443370/1212.5204-libre.pdf?1644277584=\u0026response-content-disposition=attachment%3B+filename%3DFReD_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=PT1loahvwy4k8UlEJJnLFmo-z0CzYxK~x7G7WGRqm~3~2H1Ip6dCfQPtYkHCeEcThS1DtNwsQ6rOrDV6IJuizqbXfg-X4qnFlNsfeYtCjkP7KgSmNoDdV~xtsUT3FNvwatU8P89M94rgSU-AizehqwkG0Ke4IfdRiiK4ZkiHhqG8S3HK7HMCZU-HUv4R1OIIvYnuSecI9CyTSeYhfgtjEnaSuClSjLGdWNNflyXQNO3ZET27gxaBK6Ehuq2uIom4nwdgvZ7Whjw8JXHq9taPsJw~DYL-rg7zJop0UUJf0oyUhlfv9VpUezY7XkHDDPqDWEpnFGnaQvcar9AzZ0ulrw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"FReD_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime","translated_slug":"","page_count":21,"language":"en","content_type":"Work","summary":"Reversible debuggers have been developed at least since 1970. Such a feature is useful when the cause of a bug is close in time to the bug manifestation. When the cause is far back in time, one resorts to setting appropriate breakpoints in the debugger and beginning a new debugging session. 
For these cases when the cause of a bug is far in time from its manifestation, bug diagnosis requires a series of debugging sessions with which to narrow down the cause of the bug. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool to search through the process lifetime and locate the cause. As an example, the bug could be related to a program invariant failing. A binary search through the process lifetime suffices, since the invariant expression is true at the beginning of the program execution, and false when the bug is encountered. An algorithm for such a binary search is presented within the FReD (Fast Reversible Debugger) software. It is based on the ability to checkpo...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443370,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443370/thumbnails/1.jpg","file_name":"1212.5204.pdf","download_url":"https://www.academia.edu/attachments/80443370/download_file","bulk_download_file_name":"FReD_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443370/1212.5204-libre.pdf?1644277584=\u0026response-content-disposition=attachment%3B+filename%3DFReD_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=PT1loahvwy4k8UlEJJnLFmo-z0CzYxK~x7G7WGRqm~3~2H1Ip6dCfQPtYkHCeEcThS1DtNwsQ6rOrDV6IJuizqbXfg-X4qnFlNsfeYtCjkP7KgSmNoDdV~xtsUT3FNvwatU8P89M94rgSU-AizehqwkG0Ke4IfdRiiK4ZkiHhqG8S3HK7HMCZU-HUv4R1OIIvYnuSecI9CyTSeYhfgtjEnaSuClSjLGdWNNflyXQNO3ZET27gxaBK6Ehuq2uIom4nwdgvZ7Whjw8JXHq9taPsJw~DYL-rg7zJop0UUJf0oyUhlfv9VpUezY7XkHDDPqDWEpnFGnaQvcar9AzZ0ulrw__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"},{"id":80443369,"title":"","file_type":"pdf","scribd_thumbnail_
url":"https://attachments.academia-assets.com/80443369/thumbnails/1.jpg","file_name":"1212.5204.pdf","download_url":"https://www.academia.edu/attachments/80443369/download_file","bulk_download_file_name":"FReD_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443369/1212.5204-libre.pdf?1644277584=\u0026response-content-disposition=attachment%3B+filename%3DFReD_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=brt~LufuX0HSRcLILgScBnimosnr18ozzG-RAVJy2QQEVvIY05rPCoRA7H06yM~582LKeHdDF-4LN2cklO1~N-GRDYBGlf84J64YZvpM07gY2Lnec-z9FbGZLeyKmxQcKNWVtllT0LM15VmzriVO0faNeBOCk03akIPUJTcj-e2jgrJhCavvq2uFJX~lCRSgIQkCaPHr3ZAG2mjuSmDx8bxBIM0fkdYEOKcV2qtRF4CwtHKp1sd3E1iqQo96gU9xeT9fDXHXc9nMUQWyxBBSD1-7SV7~NactxmlgBk2gYRSmr5Chp0ikqjJelijG6Sqh1jXdsMm4NTqGB3pUHSD0Vg__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[],"urls":[{"id":17437732,"url":"https://archive.org/download/arxiv-1212.5204/1212.5204.pdf"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877693-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877692"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877692/DMTCP_Transparent_checkpointing_for_cluster_computations_and_the_desktop"><img alt="Research paper thumbnail of DMTCP: Transparent checkpointing for cluster computations and the desktop" class="work-thumbnail" src="https://attachments.academia-assets.com/80443430/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" 
href="https://www.academia.edu/70877692/DMTCP_Transparent_checkpointing_for_cluster_computations_and_the_desktop">DMTCP: Transparent checkpointing for cluster computations and the desktop</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package for distributed applications. Checkpointing and restart is demonstrated for a wide range of over 20 well known applications, including MATLAB, Python, TightVNC, MPICH2, OpenMPI, and runCMS. RunCMS runs as a 680 MB image in memory that includes 540 dynamic libraries, and is used for the CMS experiment of the Large Hadron Collider at CERN. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads; as well as typical desktop applications. On 128 distributed cores (32 nodes), checkpoint and restart times are typically 2 seconds, with negligible run-time overhead. Typical checkpoint times are reduced to 0.2 seconds when using forked checkpointing. Experimental results show that checkpoint time remains nearly constant as the number of nodes increases on a medium-size cluster. 
DMTCP automatically accounts for fork, exec, ssh, mutexes/semaphor...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="44b1cee6e183759b133d3f811d2c3501" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443430,&quot;asset_id&quot;:70877692,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443430/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877692"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877692"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877692; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877692]").text(description); $(".js-view-count[data-work-id=70877692]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877692; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877692']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); 
container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "44b1cee6e183759b133d3f811d2c3501" } } $('.js-work-strip[data-work-id=70877692]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877692,"title":"DMTCP: Transparent checkpointing for cluster computations and the desktop","translated_title":"","metadata":{"abstract":"DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package for distributed applications. Checkpointing and restart is demonstrated for a wide range of over 20 well known applications, including MATLAB, Python, TightVNC, MPICH2, OpenMPI, and runCMS. RunCMS runs as a 680 MB image in memory that includes 540 dynamic libraries, and is used for the CMS experiment of the Large Hadron Collider at CERN. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads; as well as typical desktop applications. On 128 distributed cores (32 nodes), checkpoint and restart times are typically 2 seconds, with negligible run-time overhead. Typical checkpoint times are reduced to 0.2 seconds when using forked checkpointing. Experimental results show that checkpoint time remains nearly constant as the number of nodes increases on a medium-size cluster. 
DMTCP automatically accounts for fork, exec, ssh, mutexes/semaphor...","publication_date":{"day":null,"month":null,"year":2009,"errors":{}}},"translated_abstract":"DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package for distributed applications. Checkpointing and restart is demonstrated for a wide range of over 20 well known applications, including MATLAB, Python, TightVNC, MPICH2, OpenMPI, and runCMS. RunCMS runs as a 680 MB image in memory that includes 540 dynamic libraries, and is used for the CMS experiment of the Large Hadron Collider at CERN. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads; as well as typical desktop applications. On 128 distributed cores (32 nodes), checkpoint and restart times are typically 2 seconds, with negligible run-time overhead. Typical checkpoint times are reduced to 0.2 seconds when using forked checkpointing. Experimental results show that checkpoint time remains nearly constant as the number of nodes increases on a medium-size cluster. 
DMTCP automatically accounts for fork, exec, ssh, mutexes/semaphor...","internal_url":"https://www.academia.edu/70877692/DMTCP_Transparent_checkpointing_for_cluster_computations_and_the_desktop","translated_internal_url":"","created_at":"2022-02-07T15:23:25.643-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443430,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443430/thumbnails/1.jpg","file_name":"ipdps09.pdf","download_url":"https://www.academia.edu/attachments/80443430/download_file","bulk_download_file_name":"DMTCP_Transparent_checkpointing_for_clus.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443430/ipdps09-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DDMTCP_Transparent_checkpointing_for_clus.pdf\u0026Expires=1743681117\u0026Signature=C03bLppfm9j52RgJ0XnNMSE7sPqLQxUq-GX4FucMgTj98WpnZjoasEHTflsMtKJJ3sB01VJ7~GiYf05ZH782I5FxWh6Y8VSyLZaiF2X4pstB4QkS9rfTOwvSilJffF-pWCUDZuuShqh6QepiJy9BB1GmE75od8UgAn1lQtoCP3KBjRyJ-jMohmxITRKxbDebeem2dl~jwntKfNxRXRBo0gy37egubT30FTycpSO6LwRweLf2l1qD91RyJDoKaveIvLqBoKGVImBteJ1kTLuAh0gxBWQwFxkynmKRZZOUZPkX8OdadkpfwF9OiU7CPYxzL9uUB~CTxJv1co5E~vQK5A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"DMTCP_Transparent_checkpointing_for_cluster_computations_and_the_desktop","translated_slug":"","page_count":12,"language":"en","content_type":"Work","summary":"DMTCP (Distributed MultiThreaded CheckPointing) is a transparent user-level checkpointing package for distributed applications. Checkpointing and restart is demonstrated for a wide range of over 20 well known applications, including MATLAB, Python, TightVNC, MPICH2, OpenMPI, and runCMS. 
RunCMS runs as a 680 MB image in memory that includes 540 dynamic libraries, and is used for the CMS experiment of the Large Hadron Collider at CERN. DMTCP transparently checkpoints general cluster computations consisting of many nodes, processes, and threads; as well as typical desktop applications. On 128 distributed cores (32 nodes), checkpoint and restart times are typically 2 seconds, with negligible run-time overhead. Typical checkpoint times are reduced to 0.2 seconds when using forked checkpointing. Experimental results show that checkpoint time remains nearly constant as the number of nodes increases on a medium-size cluster. DMTCP automatically accounts for fork, exec, ssh, mutexes/semaphor...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443430,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443430/thumbnails/1.jpg","file_name":"ipdps09.pdf","download_url":"https://www.academia.edu/attachments/80443430/download_file","bulk_download_file_name":"DMTCP_Transparent_checkpointing_for_clus.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443430/ipdps09-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DDMTCP_Transparent_checkpointing_for_clus.pdf\u0026Expires=1743681117\u0026Signature=C03bLppfm9j52RgJ0XnNMSE7sPqLQxUq-GX4FucMgTj98WpnZjoasEHTflsMtKJJ3sB01VJ7~GiYf05ZH782I5FxWh6Y8VSyLZaiF2X4pstB4QkS9rfTOwvSilJffF-pWCUDZuuShqh6QepiJy9BB1GmE75od8UgAn1lQtoCP3KBjRyJ-jMohmxITRKxbDebeem2dl~jwntKfNxRXRBo0gy37egubT30FTycpSO6LwRweLf2l1qD91RyJDoKaveIvLqBoKGVImBteJ1kTLuAh0gxBWQwFxkynmKRZZOUZPkX8OdadkpfwF9OiU7CPYxzL9uUB~CTxJv1co5E~vQK5A__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer 
Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":5021,"name":"Packaging","url":"https://www.academia.edu/Documents/in/Packaging"},{"id":34740,"name":"Cluster Computing","url":"https://www.academia.edu/Documents/in/Cluster_Computing"},{"id":44244,"name":"OPERATING SYSTEM","url":"https://www.academia.edu/Documents/in/OPERATING_SYSTEM"},{"id":67584,"name":"Large Hadron Collider","url":"https://www.academia.edu/Documents/in/Large_Hadron_Collider"},{"id":80870,"name":"Parallel \u0026 Distributed Computing","url":"https://www.academia.edu/Documents/in/Parallel_and_Distributed_Computing"},{"id":97733,"name":"Shared memory","url":"https://www.academia.edu/Documents/in/Shared_memory"},{"id":191487,"name":"Kernel","url":"https://www.academia.edu/Documents/in/Kernel"}],"urls":[{"id":17437731,"url":"http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.207.7693\u0026rep=rep1\u0026type=pdf"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877692-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877691"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877691/Semi_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime"><img alt="Research paper thumbnail of Semi-Automated Debugging via Binary Search through a Process Lifetime" class="work-thumbnail" src="https://attachments.academia-assets.com/80443431/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" 
href="https://www.academia.edu/70877691/Semi_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime">Semi-Automated Debugging via Binary Search through a Process Lifetime</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">A common programmer experience is to execute a long-running computation only to see a bug crash t...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">A common programmer experience is to execute a long-running computation only to see a bug crash the program after hours or days. While it is often easy to capture a &quot;buggy&quot; expression value at the point of the crash, it is less easy to discover the point in the program where the expression became buggy. For such &quot;difficult&quot; bugs, this work presents an automated tool based on binary search through a process lifetime. The tool operates both in single-threaded and multi-threaded programs. The underlying algorithm depends on checkpoints, deterministic replay, and decomposition of debugging histories. The tool is scalable in the sense that the running time is a small constant factor beyond the standalone running time. Further, it requires only a logarithmic number of probes of the expression value — an advantage when the time to execute the expression is large. 
The algorithm is demonstrated for such realworld programs as MySQL.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="834a2dcec8422e0d028df02587fad5ea" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443431,&quot;asset_id&quot;:70877691,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443431/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877691"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877691"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877691; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877691]").text(description); $(".js-view-count[data-work-id=70877691]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877691; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877691']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); 
container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "834a2dcec8422e0d028df02587fad5ea" } } $('.js-work-strip[data-work-id=70877691]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877691,"title":"Semi-Automated Debugging via Binary Search through a Process Lifetime","translated_title":"","metadata":{"abstract":"A common programmer experience is to execute a long-running computation only to see a bug crash the program after hours or days. While it is often easy to capture a \u0026quot;buggy\u0026quot; expression value at the point of the crash, it is less easy to discover the point in the program where the expression became buggy. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool based on binary search through a process lifetime. The tool operates both in singlethreaded and multi-threaded program. The underlying algorithm depends on on checkpoints, deterministic replay, and decomposition of debugging histories. The tool is scalable in the sense that the running time is a small constant factor beyond the standalone running time. Further, it requires only a logarithmic number of probes of the expression value -an advantage when the time to execute the expression is large. 
The algorithm is demonstrated for such realworld programs as MySQL.","publication_date":{"day":null,"month":null,"year":2020,"errors":{}}},"translated_abstract":"A common programmer experience is to execute a long-running computation only to see a bug crash the program after hours or days. While it is often easy to capture a \u0026quot;buggy\u0026quot; expression value at the point of the crash, it is less easy to discover the point in the program where the expression became buggy. For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool based on binary search through a process lifetime. The tool operates both in singlethreaded and multi-threaded program. The underlying algorithm depends on on checkpoints, deterministic replay, and decomposition of debugging histories. The tool is scalable in the sense that the running time is a small constant factor beyond the standalone running time. Further, it requires only a logarithmic number of probes of the expression value -an advantage when the time to execute the expression is large. 
The algorithm is demonstrated for such realworld programs as MySQL.","internal_url":"https://www.academia.edu/70877691/Semi_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime","translated_internal_url":"","created_at":"2022-02-07T15:23:25.464-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443431,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443431/thumbnails/1.jpg","file_name":"1212.pdf","download_url":"https://www.academia.edu/attachments/80443431/download_file","bulk_download_file_name":"Semi_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443431/1212-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DSemi_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=bUhb7yoiQiMSxeUhfadqiss2eYraiGP3RUyIUIm3I~ao7qbwe-yrJkmJ7Jh7Jcg9vphiekYgB85d9NFQ7p0sVpvVzoK1y7WgKAxetM4AMx2BcsmZY8L6vAYdJ~jQchOe33-XOpKQ~Iywb~3vNCtjGUjvpRthmAidC82Vi-PLiEJCbJOVeAp5J5sLW~3-BBL2ThgmvS6XzMk363GbSzSwBk6HMigRQxXosU6eeu26RFKhTa3tofYwI5KB7QKhIcjkuxzu1EDdfcEC55O-5K0rAESZnI1mA8BJzawURn6gRM3bDQZbwBgjPx1fvwhm0YyGsIGo4a6oZV0aqi4ccPAC3Q__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Semi_Automated_Debugging_via_Binary_Search_through_a_Process_Lifetime","translated_slug":"","page_count":21,"language":"en","content_type":"Work","summary":"A common programmer experience is to execute a long-running computation only to see a bug crash the program after hours or days. While it is often easy to capture a \u0026quot;buggy\u0026quot; expression value at the point of the crash, it is less easy to discover the point in the program where the expression became buggy. 
For such \u0026quot;difficult\u0026quot; bugs, this work presents an automated tool based on binary search through a process lifetime. The tool operates both in singlethreaded and multi-threaded program. The underlying algorithm depends on on checkpoints, deterministic replay, and decomposition of debugging histories. The tool is scalable in the sense that the running time is a small constant factor beyond the standalone running time. Further, it requires only a logarithmic number of probes of the expression value -an advantage when the time to execute the expression is large. The algorithm is demonstrated for such realworld programs as MySQL.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443431,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443431/thumbnails/1.jpg","file_name":"1212.pdf","download_url":"https://www.academia.edu/attachments/80443431/download_file","bulk_download_file_name":"Semi_Automated_Debugging_via_Binary_Sear.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443431/1212-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DSemi_Automated_Debugging_via_Binary_Sear.pdf\u0026Expires=1743681117\u0026Signature=bUhb7yoiQiMSxeUhfadqiss2eYraiGP3RUyIUIm3I~ao7qbwe-yrJkmJ7Jh7Jcg9vphiekYgB85d9NFQ7p0sVpvVzoK1y7WgKAxetM4AMx2BcsmZY8L6vAYdJ~jQchOe33-XOpKQ~Iywb~3vNCtjGUjvpRthmAidC82Vi-PLiEJCbJOVeAp5J5sLW~3-BBL2ThgmvS6XzMk363GbSzSwBk6HMigRQxXosU6eeu26RFKhTa3tofYwI5KB7QKhIcjkuxzu1EDdfcEC55O-5K0rAESZnI1mA8BJzawURn6gRM3bDQZbwBgjPx1fvwhm0YyGsIGo4a6oZV0aqi4ccPAC3Q__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer 
Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":8137,"name":"Unix","url":"https://www.academia.edu/Documents/in/Unix"},{"id":43254,"name":"Composition","url":"https://www.academia.edu/Documents/in/Composition"},{"id":99915,"name":"Integration","url":"https://www.academia.edu/Documents/in/Integration"},{"id":2722261,"name":"Metasystem","url":"https://www.academia.edu/Documents/in/Metasystem"},{"id":3193313,"name":"arXiv","url":"https://www.academia.edu/Documents/in/arXiv"}],"urls":[{"id":17437730,"url":"http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1062.8803\u0026rep=rep1\u0026type=pdf"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877691-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877690"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877690/Design_and_Implementation_for_Checkpointing_of_Distributed_Resources_Using_Process_Level_Virtualization"><img alt="Research paper thumbnail of Design and Implementation for Checkpointing of Distributed Resources Using Process-Level Virtualization" class="work-thumbnail" src="https://attachments.academia-assets.com/80443428/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/70877690/Design_and_Implementation_for_Checkpointing_of_Distributed_Resources_Using_Process_Level_Virtualization">Design and Implementation for Checkpointing of Distributed Resources Using Process-Level Virtualization</a></div><div class="wp-workCard_item"><span 
class="js-work-more-abstract-truncated">System-level checkpoint-restart is a critical technology for long-running jobs in high-performanc...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">System-level checkpoint-restart is a critical technology for long-running jobs in high-performance computing. Yet, only two approaches to checkpointing MPI applications continue to survive in wide use today. One approach is to use the kernel module-based BLCR in combination with an MPI checkpoint-restart service particular to the MPI implementation in use. Unfortunately, this lacks support for some important Linux system services such as SysV IPC (e.g., shared memory objects). A second approach has been to use the original 2009 DMTCP implementation (herein referred to as DMTCP-09) for transparent, system-level checkpointing. Unfortunately, DMTCP-09 lacked support for checkpointing many of the necessary features found by MPI in a modern batch environment. 
These include: ssh, the InfiniBand network, process migration (restarting an MPI application on different cluster nodes), and modified file path prefixes on restart (typically due to a changing current directory, mount points, libra...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="c9810bcec5364be316e8e9cd33603a2a" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443428,&quot;asset_id&quot;:70877690,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443428/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877690"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877690"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877690; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877690]").text(description); $(".js-view-count[data-work-id=70877690]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877690; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = 
$(".js-work-strip[data-work-id='70877690']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "c9810bcec5364be316e8e9cd33603a2a" } } $('.js-work-strip[data-work-id=70877690]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877690,"title":"Design and Implementation for Checkpointing of Distributed Resources Using Process-Level Virtualization","translated_title":"","metadata":{"abstract":"System-level checkpoint-restart is a critical technology for long-running jobs in high-performance computing. Yet, only two approaches to checkpointing MPI applications continue to survive in wide use today. One approach is to use the kernel module-based BLCR in combination with an MPI checkpoint-restart service particular to the MPI implementation in use. Unfortunately, this lacks support for some important Linux system services such as SysV IPC (e.g., shared memory objects). A second approach has been to use the original 2009 DMTCP implementation (herein referred to as DMTCP-09) for transparent, system-level checkpointing. 
Unfortunately, DMTCP-09 lacked support for checkpointing many of the necessary features found by MPI in a modern batch environment. These include: ssh, the InfiniBand network, process migration (restarting an MPI application on different cluster nodes), and modified file path prefixes on restart (typically due to a changing current directory, mount points, libra...","publisher":"2016 IEEE International Conference on Cluster Computing (CLUSTER)","publication_date":{"day":null,"month":null,"year":2016,"errors":{}}},"translated_abstract":"System-level checkpoint-restart is a critical technology for long-running jobs in high-performance computing. Yet, only two approaches to checkpointing MPI applications continue to survive in wide use today. One approach is to use the kernel module-based BLCR in combination with an MPI checkpoint-restart service particular to the MPI implementation in use. Unfortunately, this lacks support for some important Linux system services such as SysV IPC (e.g., shared memory objects). A second approach has been to use the original 2009 DMTCP implementation (herein referred to as DMTCP-09) for transparent, system-level checkpointing. Unfortunately, DMTCP-09 lacked support for checkpointing many of the necessary features found by MPI in a modern batch environment. 
These include: ssh, the InfiniBand network, process migration (restarting an MPI application on different cluster nodes), and modified file path prefixes on restart (typically due to a changing current directory, mount points, libra...","internal_url":"https://www.academia.edu/70877690/Design_and_Implementation_for_Checkpointing_of_Distributed_Resources_Using_Process_Level_Virtualization","translated_internal_url":"","created_at":"2022-02-07T15:23:25.227-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443428,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443428/thumbnails/1.jpg","file_name":"cluster16.pdf","download_url":"https://www.academia.edu/attachments/80443428/download_file","bulk_download_file_name":"Design_and_Implementation_for_Checkpoint.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443428/cluster16-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DDesign_and_Implementation_for_Checkpoint.pdf\u0026Expires=1743681117\u0026Signature=FIArRc0yQAb-ahw50oM7gX-5z0X2Jy4bXL6D-qUq06G4WU8NmSdm-1Us19d2vWvpC5MPvwiYOJT2LJSGFAmKIZfNRnO-Wpg804h1DY0qK-2d~sjI8Gvl2yK0Kemm4USxwjyU~IHevzvgZpcp-3GN7c3M9pZiR-gIqLqo~R3A1GmjN0OIl1-uLYAY4g~x667QsM8sla-460lWaBXhU8EmXh5w91KebeRvSp322-g~XTxj7vY1hQLA7eqoSbUXrMAuwiE0hs98bWACSCPWbI4yvJKGKnDmWREo8kmo5VkInuT2YaR21D7MZg1c58~Z6YKR5XJUcT~vMF8F7DRsGMdAEQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"Design_and_Implementation_for_Checkpointing_of_Distributed_Resources_Using_Process_Level_Virtualization","translated_slug":"","page_count":11,"language":"en","content_type":"Work","summary":"System-level checkpoint-restart is a critical technology for long-running jobs in high-performance computing. 
Yet, only two approaches to checkpointing MPI applications continue to survive in wide use today. One approach is to use the kernel module-based BLCR in combination with an MPI checkpoint-restart service particular to the MPI implementation in use. Unfortunately, this lacks support for some important Linux system services such as SysV IPC (e.g., shared memory objects). A second approach has been to use the original 2009 DMTCP implementation (herein referred to as DMTCP-09) for transparent, system-level checkpointing. Unfortunately, DMTCP-09 lacked support for checkpointing many of the necessary features found by MPI in a modern batch environment. These include: ssh, the InfiniBand network, process migration (restarting an MPI application on different cluster nodes), and modified file path prefixes on restart (typically due to a changing current directory, mount points, libra...","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443428,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443428/thumbnails/1.jpg","file_name":"cluster16.pdf","download_url":"https://www.academia.edu/attachments/80443428/download_file","bulk_download_file_name":"Design_and_Implementation_for_Checkpoint.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443428/cluster16-libre.pdf?1644277575=\u0026response-content-disposition=attachment%3B+filename%3DDesign_and_Implementation_for_Checkpoint.pdf\u0026Expires=1743681117\u0026Signature=FIArRc0yQAb-ahw50oM7gX-5z0X2Jy4bXL6D-qUq06G4WU8NmSdm-1Us19d2vWvpC5MPvwiYOJT2LJSGFAmKIZfNRnO-Wpg804h1DY0qK-2d~sjI8Gvl2yK0Kemm4USxwjyU~IHevzvgZpcp-3GN7c3M9pZiR-gIqLqo~R3A1GmjN0OIl1-uLYAY4g~x667QsM8sla-460lWaBXhU8EmXh5w91KebeRvSp322-g~XTxj7vY1hQLA7eqoSbUXrMAuwiE0hs98bWACSCPWbI4yvJKGKnDmWREo8kmo5VkInuT2YaR21D7MZg1c58~Z6YKR5XJUcT~vMF8F7DRsGMdAEQ__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[],"urls":[{"id":17437729,"url":"http://doi.ieeecomputersociety.org/10.1109/CLUSTER.2016.55"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877690-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877689"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877689/User_space_process_virtualization_in_the_context_of_checkpoint_restart_and_virtual_machines"><img alt="Research paper thumbnail of User-space process virtualization in the context of checkpoint-restart and virtual machines" class="work-thumbnail" src="https://attachments.academia-assets.com/80443429/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div 
class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/70877689/User_space_process_virtualization_in_the_context_of_checkpoint_restart_and_virtual_machines">User-space process virtualization in the context of checkpoint-restart and virtual machines</a></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Checkpoint-Restart is the ability to save a set of running processes to a checkpoint image on dis...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Checkpoint-Restart is the ability to save a set of running processes to a checkpoint image on disk, and to later restart them from the disk. In addition to its traditional use in fault tolerance, recovering from a system failure, it has numerous other uses, such as for application debugging and save/restore of the workspace of an interactive problem-solving environment. Transparent checkpointing operates without modifying the underlying application program, but it implicitly relies on a &quot;Closed World Assumption&quot;-the world (including file system, network, etc.) will look the same upon restart as it did at the time of checkpoint. This is not valid for more complex programs. Until now, checkpoint-restart packages have adopted ad hoc solutions for each case where the environment changes upon restart. This dissertation presents user-space process virtualization to decouple application processes from the external subsystems. A thin virtualization layer is introduced between the application and each external subsystem. It provides the application with a consistent view of the external world and allows for checkpoint-restart to succeed. 
The ever growing number of external subsystems make it harder to deploy and maintain virtualization layers in a monolithic checkpoint-restart system. To address this, an adaptive plugin based approach is used to implement the virtualization layers that allow the checkpoint-restart system to grow organically. The principle of decoupling the external subsystem through process virtualization is also applied in the context of virtual machines for providing a solution to the long standing double-paging problem. Double-paging occurs when the guest attempts to page out memory that has previously been swapped out by the hypervisor and leads to long delays for the guest as the contents are read back into machine memory only to be written out again. The performance rapidly drops as a result of significant lengthening of the time to complete the guest I/O request.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="655cc617adb2884f1740f0faa11c4c32" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443429,&quot;asset_id&quot;:70877689,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443429/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877689"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877689"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var 
workId = 70877689; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877689]").text(description); $(".js-view-count[data-work-id=70877689]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877689; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877689']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "655cc617adb2884f1740f0faa11c4c32" } } $('.js-work-strip[data-work-id=70877689]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877689,"title":"User-space process virtualization in the context of checkpoint-restart and virtual machines","translated_title":"","metadata":{"grobid_abstract":"Checkpoint-Restart is the ability to save a set of running processes to a checkpoint image on disk, and to later restart them from the disk. 
In addition to its traditional use in fault tolerance, recovering from a system failure, it has numerous other uses, such as for application debugging and save/restore of the workspace of an interactive problem-solving environment. Transparent checkpointing operates without modifying the underlying application program, but it implicitly relies on a \"Closed World Assumption\"-the world (including file system, network, etc.) will look the same upon restart as it did at the time of checkpoint. This is not valid for more complex programs. Until now, checkpoint-restart packages have adopted ad hoc solutions for each case where the environment changes upon restart. This dissertation presents user-space process virtualization to decouple application processes from the external subsystems. A thin virtualization layer is introduced between the application and each external subsystem. It provides the application with a consistent view of the external world and allows for checkpoint-restart to succeed. The ever growing number of external subsystems make it harder to deploy and maintain virtualization layers in a monolithic checkpoint-restart system. To address this, an adaptive plugin based approach is used to implement the virtualization layers that allow the checkpoint-restart system to grow organically. The principle of decoupling the external subsystem through process virtualization is also applied in the context of virtual machines for providing a solution to the long standing double-paging problem. Double-paging occurs when the guest attempts to page out memory that has previously been swapped out by the hypervisor and leads to long delays for the guest as the contents are read back into machine memory only to be written out again. 
The performance rapidly drops as a result of significant lengthening of the time to complete the guest I/O request.","publication_date":{"day":null,"month":null,"year":2014,"errors":{}},"grobid_abstract_attachment_id":80443429},"translated_abstract":null,"internal_url":"https://www.academia.edu/70877689/User_space_process_virtualization_in_the_context_of_checkpoint_restart_and_virtual_machines","translated_internal_url":"","created_at":"2022-02-07T15:23:24.846-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443429,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443429/thumbnails/1.jpg","file_name":"fulltext.pdf","download_url":"https://www.academia.edu/attachments/80443429/download_file","bulk_download_file_name":"User_space_process_virtualization_in_the.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443429/fulltext-libre.pdf?1644277580=\u0026response-content-disposition=attachment%3B+filename%3DUser_space_process_virtualization_in_the.pdf\u0026Expires=1743681117\u0026Signature=RC5uScnK16zpQqQnJ0yMQNXgSMhL85cqnKy4W3~iMOT8Z5d0DY~dmyYys4LqoigEfliVOmeRcPrgVZ9HuuoPUNktQRKBIHZIpc9m87B179xxKz-nzN6ltXNzEPeqqVaN44IDwal10ZU0eT1~f7jcT7z6gqU2nG9GyM0CauFkwcBwEXCOWJ5xOzBw2dSKvPUuyp3pCdWymdL9dKvfjAPOpGsgmDjUP9DsXgGOItBvF2F~OFg3rKVh~z3SBHBYtuDIi8tQLOWiXw2S3Oc8-~OPOBBE5qheVoz2BAkgYtlLyXGjYftVM1qMFY-BBUPUuUVjLsPZ-xJmgoJGidc2wvYDug__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"User_space_process_virtualization_in_the_context_of_checkpoint_restart_and_virtual_machines","translated_slug":"","page_count":200,"language":"en","content_type":"Work","summary":"Checkpoint-Restart is the ability to save a set of running processes to a checkpoint image on disk, and to later restart them from the disk. 
In addition to its traditional use in fault tolerance, recovering from a system failure, it has numerous other uses, such as for application debugging and save/restore of the workspace of an interactive problem-solving environment. Transparent checkpointing operates without modifying the underlying application program, but it implicitly relies on a \"Closed World Assumption\"-the world (including file system, network, etc.) will look the same upon restart as it did at the time of checkpoint. This is not valid for more complex programs. Until now, checkpoint-restart packages have adopted ad hoc solutions for each case where the environment changes upon restart. This dissertation presents user-space process virtualization to decouple application processes from the external subsystems. A thin virtualization layer is introduced between the application and each external subsystem. It provides the application with a consistent view of the external world and allows for checkpoint-restart to succeed. The ever growing number of external subsystems make it harder to deploy and maintain virtualization layers in a monolithic checkpoint-restart system. To address this, an adaptive plugin based approach is used to implement the virtualization layers that allow the checkpoint-restart system to grow organically. The principle of decoupling the external subsystem through process virtualization is also applied in the context of virtual machines for providing a solution to the long standing double-paging problem. Double-paging occurs when the guest attempts to page out memory that has previously been swapped out by the hypervisor and leads to long delays for the guest as the contents are read back into machine memory only to be written out again. 
The performance rapidly drops as a result of significant lengthening of the time to complete the guest I/O request.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443429,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443429/thumbnails/1.jpg","file_name":"fulltext.pdf","download_url":"https://www.academia.edu/attachments/80443429/download_file","bulk_download_file_name":"User_space_process_virtualization_in_the.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443429/fulltext-libre.pdf?1644277580=\u0026response-content-disposition=attachment%3B+filename%3DUser_space_process_virtualization_in_the.pdf\u0026Expires=1743681117\u0026Signature=RC5uScnK16zpQqQnJ0yMQNXgSMhL85cqnKy4W3~iMOT8Z5d0DY~dmyYys4LqoigEfliVOmeRcPrgVZ9HuuoPUNktQRKBIHZIpc9m87B179xxKz-nzN6ltXNzEPeqqVaN44IDwal10ZU0eT1~f7jcT7z6gqU2nG9GyM0CauFkwcBwEXCOWJ5xOzBw2dSKvPUuyp3pCdWymdL9dKvfjAPOpGsgmDjUP9DsXgGOItBvF2F~OFg3rKVh~z3SBHBYtuDIi8tQLOWiXw2S3Oc8-~OPOBBE5qheVoz2BAkgYtlLyXGjYftVM1qMFY-BBUPUuUVjLsPZ-xJmgoJGidc2wvYDug__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"},{"id":440,"name":"Distributed Computing","url":"https://www.academia.edu/Documents/in/Distributed_Computing"},{"id":15835,"name":"Virtual Machines","url":"https://www.academia.edu/Documents/in/Virtual_Machines"},{"id":36300,"name":"Fault Tolerance","url":"https://www.academia.edu/Documents/in/Fault_Tolerance"},{"id":327659,"name":"Paging","url":"https://www.academia.edu/Documents/in/Paging"}],"urls":[{"id":17437728,"url":"http://iris.lib.neu.edu/cgi/viewcontent.cgi?article=1042\u0026context=comp_sci_diss"}]}, dispatcherData: 
dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877689-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877688"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" rel="nofollow" href="https://www.academia.edu/70877688/Detecting_and_Suppressing_Redundant_Input_Output_Operations"><img alt="Research paper thumbnail of Detecting and Suppressing Redundant Input-Output Operations" class="work-thumbnail" src="https://a.academia-assets.com/images/blank-paper.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title">Detecting and Suppressing Redundant Input-Output Operations</div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877688"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877688"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877688; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877688]").text(description); $(".js-view-count[data-work-id=70877688]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x 
work-percentile"></span></span><script>$(function () { var workId = 70877688; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877688']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (false){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "-1" } } $('.js-work-strip[data-work-id=70877688]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877688,"title":"Detecting and Suppressing Redundant Input-Output 
Operations","translated_title":"","metadata":{"publication_date":{"day":20,"month":12,"year":2012,"errors":{}}},"translated_abstract":null,"internal_url":"https://www.academia.edu/70877688/Detecting_and_Suppressing_Redundant_Input_Output_Operations","translated_internal_url":"","created_at":"2022-02-07T15:23:24.713-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[],"slug":"Detecting_and_Suppressing_Redundant_Input_Output_Operations","translated_slug":"","page_count":null,"language":"en","content_type":"Work","summary":null,"owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[],"research_interests":[],"urls":[{"id":17437727,"url":"http://www.freepatentsonline.com/y2012/0324181.html"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877688-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877687"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" rel="nofollow" href="https://www.academia.edu/70877687/DMTCP_bringing_interactive_checkpoint_restart_to_Python"><img alt="Research paper thumbnail of DMTCP: bringing interactive checkpoint–restart to Python" class="work-thumbnail" src="https://a.academia-assets.com/images/blank-paper.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title">DMTCP: bringing interactive checkpoint–restart to Python</div><div 
class="wp-workCard_item"><span>Computational Science &amp; Discovery</span><span>, 2015</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operat...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operates in user space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython &#39;from the outside&#39; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be requested interactively within a Python session or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process) and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using virtual network client, (ii) a fast/slow technique to use multiple hosts or cores to check one (Cython Behnel S et al 2011 Comput. Sci. Eng. 
13 31–39) computation in parallel, and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint feature for locating the cause of a bug.</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877687"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877687"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877687; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877687]").text(description); $(".js-view-count[data-work-id=70877687]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877687; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877687']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 
})(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (false){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "-1" } } $('.js-work-strip[data-work-id=70877687]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877687,"title":"DMTCP: bringing interactive checkpoint–restart to Python","translated_title":"","metadata":{"abstract":"DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operates in user space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026#39;from the outside\u0026#39; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be requested interactively within a Python session or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process) and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using virtual network client, (ii) a fast/slow technique to use multiple hosts or cores to check one (Cython Behnel S et al 2011 Comput. Sci. Eng. 
13 31–39) computation in parallel, and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint feature for locating the cause of a bug.","publisher":"IOP Publishing","publication_date":{"day":null,"month":null,"year":2015,"errors":{}},"publication_name":"Computational Science \u0026 Discovery"},"translated_abstract":"DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operates in user space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026#39;from the outside\u0026#39; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be requested interactively within a Python session or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process) and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using virtual network client, (ii) a fast/slow technique to use multiple hosts or cores to check one (Cython Behnel S et al 2011 Comput. Sci. Eng. 
13 31–39) computation in parallel, and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint feature for locating the cause of a bug.","internal_url":"https://www.academia.edu/70877687/DMTCP_bringing_interactive_checkpoint_restart_to_Python","translated_internal_url":"","created_at":"2022-02-07T15:23:24.614-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[],"slug":"DMTCP_bringing_interactive_checkpoint_restart_to_Python","translated_slug":"","page_count":null,"language":"en","content_type":"Work","summary":"DMTCP (Distributed MultiThreaded CheckPointing) is a mature checkpoint–restart package. It operates in user space without kernel privilege, and adapts to application-specific requirements through plugins. While DMTCP has been able to checkpoint Python and IPython \u0026#39;from the outside\u0026#39; for many years, a Python module has recently been created to support DMTCP. IPython support is included through a new DMTCP plugin. A checkpoint can be requested interactively within a Python session or under the control of a specific Python program. Further, the Python program can execute specific Python code prior to checkpoint, upon resuming (within the original process) and upon restarting (from a checkpoint image). Applications of DMTCP are demonstrated for: (i) Python-based graphics using virtual network client, (ii) a fast/slow technique to use multiple hosts or cores to check one (Cython Behnel S et al 2011 Comput. Sci. Eng. 
13 31–39) computation in parallel, and (iii) a reversible debugger, FReD, with a novel reverse-expression watchpoint feature for locating the cause of a bug.","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877687-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="70877571"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" href="https://www.academia.edu/70877571/SmartHarvest"><img alt="Research paper thumbnail of SmartHarvest" class="work-thumbnail" src="https://attachments.academia-assets.com/80443367/thumbnails/1.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title"><a class="js-work-strip-work-link text-gray-darker" data-click-track="profile-work-strip-title" href="https://www.academia.edu/70877571/SmartHarvest">SmartHarvest</a></div><div class="wp-workCard_item"><span>Proceedings of the Sixteenth European Conference on Computer Systems</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">We can increase the efficiency of public cloud datacenters by harvesting allocated but temporaril...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa 
fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">We can increase the efficiency of public cloud datacenters by harvesting allocated but temporarily idling CPU cores from customer virtual machines (VMs) to run batch or analytics workloads. Even small efficiency gains translate into substantial savings, since provisioning and operating a datacenter costs hundreds of millions of dollars per year. The main challenge is to harvest idle cores with little or no impact on customer VMs, which could be running latency-sensitive services and are essentially black-boxes to the cloud provider. We introduce ElasticVM, a new VM type that can run batch workloads cheaply using mainly harvested cores. We also propose SmartHarvest, a system that dynamically manages the number of cores available to ElasticVMs in each fine-grained time window. SmartHarvest uses online learning to predict the core demand of primary, customer VMs and compute the number of cores that can be safely harvested. Our results show that SmartHarvest can harvest a significant amount of CPU resources without increasing the 99th-percentile tail latency of latency-critical primary workloads by more than 10%. 
Unlike static harvesting techniques</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><a id="3bab90b5ec720b53f4ff1b52bc8f4126" class="wp-workCard--action" rel="nofollow" data-click-track="profile-work-strip-download" data-download="{&quot;attachment_id&quot;:80443367,&quot;asset_id&quot;:70877571,&quot;asset_type&quot;:&quot;Work&quot;,&quot;button_location&quot;:&quot;profile&quot;}" href="https://www.academia.edu/attachments/80443367/download_file?s=profile"><span><i class="fa fa-arrow-down"></i></span><span>Download</span></a><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="70877571"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="70877571"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 70877571; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=70877571]").text(description); $(".js-view-count[data-work-id=70877571]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 70877571; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='70877571']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); 
container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (true){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "3bab90b5ec720b53f4ff1b52bc8f4126" } } $('.js-work-strip[data-work-id=70877571]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":70877571,"title":"SmartHarvest","translated_title":"","metadata":{"publisher":"ACM","ai_title_tag":"Smart Harvesting of Idle Cloud CPU Cores","grobid_abstract":"We can increase the efficiency of public cloud datacenters by harvesting allocated but temporarily idling CPU cores from customer virtual machines (VMs) to run batch or analytics workloads. Even small efficiency gains translate into substantial savings, since provisioning and operating a datacenter costs hundreds of millions of dollars per year. The main challenge is to harvest idle cores with little or no impact on customer VMs, which could be running latency-sensitive services and are essentially black-boxes to the cloud provider. We introduce ElasticVM, a new VM type that can run batch workloads cheaply using mainly harvested cores. We also propose SmartHarvest, a system that dynamically manages the number of cores available to ElasticVMs in each fine-grained time window. SmartHarvest uses online learning to predict the core demand of primary, customer VMs and compute the number of cores that can be safely harvested. 
Our results show that SmartHarvest can harvest a significant amount of CPU resources without increasing the 99th-percentile tail latency of latency-critical primary workloads by more than 10%. Unlike static harvesting techniques","publication_name":"Proceedings of the Sixteenth European Conference on Computer Systems","grobid_abstract_attachment_id":80443367},"translated_abstract":null,"internal_url":"https://www.academia.edu/70877571/SmartHarvest","translated_internal_url":"","created_at":"2022-02-07T15:22:22.514-08:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":46423,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[],"downloadable_attachments":[{"id":80443367,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443367/thumbnails/1.jpg","file_name":"eurosys21.pdf","download_url":"https://www.academia.edu/attachments/80443367/download_file","bulk_download_file_name":"SmartHarvest.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443367/eurosys21-libre.pdf?1644277588=\u0026response-content-disposition=attachment%3B+filename%3DSmartHarvest.pdf\u0026Expires=1743681117\u0026Signature=BWTqa-8mVQM77Ckw3AQk2lnrfBeOjOPbF-tnWf~s7adzI5S3sAxaPrSRRJs9TrNZhb8r9~GxFIE60dIJ1A0~8R93C~vLYrNzRc03ULDPfCOmbOy~FavHVZsI6ESyUAZgL~35ii-1A8NSsmufDcH1Jwn0F4zYGtRESzHkEO5liSv1oUemnQLfAKOZmumvUX8Qd38zHb5bL1swJx2PxBMwjmdWW-ZOg2uqL~rM4z~Vs0s7G6W53~IB7rpauiaj0fVvD31Oxh82wUyDce4oEMbLK9FslDT0sVhEV-bhnzgJc5hEVGp9QctSqWRKVkToaIKtS83AWjyHoErMCrgLHNoL8w__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"slug":"SmartHarvest","translated_slug":"","page_count":16,"language":"en","content_type":"Work","summary":"We can increase the efficiency of public cloud datacenters by harvesting allocated but temporarily idling CPU cores from customer virtual machines (VMs) to run batch or analytics workloads. 
Even small efficiency gains translate into substantial savings, since provisioning and operating a datacenter costs hundreds of millions of dollars per year. The main challenge is to harvest idle cores with little or no impact on customer VMs, which could be running latency-sensitive services and are essentially black-boxes to the cloud provider. We introduce ElasticVM, a new VM type that can run batch workloads cheaply using mainly harvested cores. We also propose SmartHarvest, a system that dynamically manages the number of cores available to ElasticVMs in each fine-grained time window. SmartHarvest uses online learning to predict the core demand of primary, customer VMs and compute the number of cores that can be safely harvested. Our results show that SmartHarvest can harvest a significant amount of CPU resources without increasing the 99th-percentile tail latency of latency-critical primary workloads by more than 10%. Unlike static harvesting techniques","owner":{"id":46423,"first_name":"Kapil","middle_initials":null,"last_name":"Arya","page_name":"KapilArya2","domain_name":"independent","created_at":"2009-06-02T23:06:29.222-07:00","display_name":"Kapil 
Arya","url":"https://independent.academia.edu/KapilArya2"},"attachments":[{"id":80443367,"title":"","file_type":"pdf","scribd_thumbnail_url":"https://attachments.academia-assets.com/80443367/thumbnails/1.jpg","file_name":"eurosys21.pdf","download_url":"https://www.academia.edu/attachments/80443367/download_file","bulk_download_file_name":"SmartHarvest.pdf","bulk_download_url":"https://d1wqtxts1xzle7.cloudfront.net/80443367/eurosys21-libre.pdf?1644277588=\u0026response-content-disposition=attachment%3B+filename%3DSmartHarvest.pdf\u0026Expires=1743681117\u0026Signature=BWTqa-8mVQM77Ckw3AQk2lnrfBeOjOPbF-tnWf~s7adzI5S3sAxaPrSRRJs9TrNZhb8r9~GxFIE60dIJ1A0~8R93C~vLYrNzRc03ULDPfCOmbOy~FavHVZsI6ESyUAZgL~35ii-1A8NSsmufDcH1Jwn0F4zYGtRESzHkEO5liSv1oUemnQLfAKOZmumvUX8Qd38zHb5bL1swJx2PxBMwjmdWW-ZOg2uqL~rM4z~Vs0s7G6W53~IB7rpauiaj0fVvD31Oxh82wUyDce4oEMbLK9FslDT0sVhEV-bhnzgJc5hEVGp9QctSqWRKVkToaIKtS83AWjyHoErMCrgLHNoL8w__\u0026Key-Pair-Id=APKAJLOHF5GGSLRBV4ZA"}],"research_interests":[{"id":422,"name":"Computer Science","url":"https://www.academia.edu/Documents/in/Computer_Science"}],"urls":[{"id":17437689,"url":"https://dl.acm.org/doi/pdf/10.1145/3447786.3456225"}]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-70877571-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="26279522"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" rel="nofollow" href="https://www.academia.edu/26279522/Urdb"><img alt="Research paper thumbnail of Urdb" class="work-thumbnail" src="https://a.academia-assets.com/images/blank-paper.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title">Urdb</div><div class="wp-workCard_item wp-workCard--coauthors"><span>by </span><span><a 
class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/TylerDeniston">Tyler Deniston</a>, <a class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/KapilArya2">Kapil Arya</a>, and <a class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/AnaMariaVisan3">Ana-Maria Visan</a></span></div><div class="wp-workCard_item"><span>Proceedings of the 6th Workshop on Programming Languages and Operating Systems - PLOS &#39;11</span><span>, 2011</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced ba...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced based on checkpoint/re-execute. It adds reversibility to a debugger, while still placing the end user within the familiar environment of their preferred debugger. 
The URDB software layer currently includes modes that understand the syntax for four debuggers: GDB for C/C++/Java/Fortran, Python (pdb), MATLAB, and Perl (perl</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="26279522"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="26279522"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 26279522; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=26279522]").text(description); $(".js-view-count[data-work-id=26279522]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 26279522; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = $(".js-work-strip[data-work-id='26279522']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script> require.config({ waitSeconds: 90 
})(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { // from javascript_helper.rb var dispatcherData = {} if (false){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "-1" } } $('.js-work-strip[data-work-id=26279522]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":26279522,"title":"Urdb","translated_title":"","metadata":{"abstract":"Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced based on checkpoint/re-execute. It adds reversibility to a debugger, while still placing the end user within the familiar environment of their preferred debugger. The URDB software layer currently includes modes that understand the syntax for four debuggers: GDB for C/C++/Java/Fortran, Python (pdb), MATLAB, and Perl (perl","publication_date":{"day":null,"month":null,"year":2011,"errors":{}},"publication_name":"Proceedings of the 6th Workshop on Programming Languages and Operating Systems - PLOS '11"},"translated_abstract":"Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced based on checkpoint/re-execute. It adds reversibility to a debugger, while still placing the end user within the familiar environment of their preferred debugger. 
The URDB software layer currently includes modes that understand the syntax for four debuggers: GDB for C/C++/Java/Fortran, Python (pdb), MATLAB, and Perl (perl","internal_url":"https://www.academia.edu/26279522/Urdb","translated_internal_url":"","created_at":"2016-06-18T05:21:08.498-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":50195091,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[{"id":21451873,"work_id":26279522,"tagging_user_id":50195091,"tagged_user_id":46423,"co_author_invite_id":4805756,"email":"k***7@gmail.com","display_order":0,"name":"Kapil Arya","title":"Urdb"},{"id":21451874,"work_id":26279522,"tagging_user_id":50195091,"tagged_user_id":null,"co_author_invite_id":4805757,"email":"t***r@c6rs.com","display_order":4194304,"name":"Tyler Denniston","title":"Urdb"},{"id":21451875,"work_id":26279522,"tagging_user_id":50195091,"tagged_user_id":49938451,"co_author_invite_id":null,"email":"g***e@ccs.neu.edu","affiliation":"Northeastern University","display_order":6291456,"name":"Gene Cooperman","title":"Urdb"},{"id":21451876,"work_id":26279522,"tagging_user_id":50195091,"tagged_user_id":50013738,"co_author_invite_id":null,"email":"a***n@google.com","display_order":7340032,"name":"Ana-Maria Visan","title":"Urdb"}],"downloadable_attachments":[],"slug":"Urdb","translated_slug":"","page_count":null,"language":"en","content_type":"Work","summary":"Reversible debuggers have existed since the early 1970s. A novel approach, URDB, is introduced based on checkpoint/re-execute. It adds reversibility to a debugger, while still placing the end user within the familiar environment of their preferred debugger. 
The URDB software layer currently includes modes that understand the syntax for four debuggers: GDB for C/C++/Java/Fortran, Python (pdb), MATLAB, and Perl (perl","owner":{"id":50195091,"first_name":"Tyler","middle_initials":null,"last_name":"Deniston","page_name":"TylerDeniston","domain_name":"independent","created_at":"2016-06-18T05:20:57.618-07:00","display_name":"Tyler Deniston","url":"https://independent.academia.edu/TylerDeniston"},"attachments":[],"research_interests":[{"id":70448,"name":"Fortran","url":"https://www.academia.edu/Documents/in/Fortran"}],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip") if (false) { Aedu.setUpFigureCarousel('profile-work-26279522-figures'); } }); </script> <div class="js-work-strip profile--work_container" data-work-id="26098718"><div class="profile--work_thumbnail hidden-xs"><a class="js-work-strip-work-link" data-click-track="profile-work-strip-thumbnail" rel="nofollow" href="https://www.academia.edu/26098718/Temporal_Debugging_Automating_Time_Travel_Debugging_with_URDB"><img alt="Research paper thumbnail of Temporal Debugging: Automating Time Travel Debugging with URDB" class="work-thumbnail" src="https://a.academia-assets.com/images/blank-paper.jpg" /></a></div><div class="wp-workCard wp-workCard_itemContainer"><div class="wp-workCard_item wp-workCard--title">Temporal Debugging: Automating Time Travel Debugging with URDB</div><div class="wp-workCard_item wp-workCard--coauthors"><span>by </span><span><a class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/AnaMariaVisan3">Ana-Maria Visan</a> and <a class="" data-click-track="profile-work-strip-authors" href="https://independent.academia.edu/KapilArya2">Kapil Arya</a></span></div><div class="wp-workCard_item"><span>ccs.neu.edu</span></div><div class="wp-workCard_item"><span class="js-work-more-abstract-truncated">... 
Xin Dong Northeastern University xindong@ccs.neu.edu Kapil Arya Tyler Denniston Praveen S. So...</span><a class="js-work-more-abstract" data-broccoli-component="work_strip.more_abstract" data-click-track="profile-work-strip-more-abstract" href="javascript:;"><span> more </span><span><i class="fa fa-caret-down"></i></span></a><span class="js-work-more-abstract-untruncated hidden">... Xin Dong Northeastern University <a href="mailto:xindong@ccs.neu.edu" rel="nofollow">xindong@ccs.neu.edu</a> Kapil Arya Tyler Denniston Praveen S. Solanki Gene Cooperman Northeastern University {kapil,tyler,psolanki,gene}@ccs.neu.edu Abstract This work addresses two classical problems in debugging. ...</span></div><div class="wp-workCard_item wp-workCard--actions"><span class="work-strip-bookmark-button-container"></span><span class="wp-workCard--action visible-if-viewed-by-owner inline-block" style="display: none;"><span class="js-profile-work-strip-edit-button-wrapper profile-work-strip-edit-button-wrapper" data-work-id="26098718"><a class="js-profile-work-strip-edit-button" tabindex="0"><span><i class="fa fa-pencil"></i></span><span>Edit</span></a></span></span></div><div class="wp-workCard_item wp-workCard--stats"><span><span><span class="js-view-count view-count u-mr2x" data-work-id="26098718"><i class="fa fa-spinner fa-spin"></i></span><script>$(function () { var workId = 26098718; window.Academia.workViewCountsFetcher.queue(workId, function (count) { var description = window.$h.commaizeInt(count) + " " + window.$h.pluralize(count, 'View'); $(".js-view-count[data-work-id=26098718]").text(description); $(".js-view-count[data-work-id=26098718]").attr('title', description).tooltip(); }); });</script></span></span><span><span class="percentile-widget hidden"><span class="u-mr2x work-percentile"></span></span><script>$(function () { var workId = 26098718; window.Academia.workPercentilesFetcher.queue(workId, function (percentileText) { var container = 
$(".js-work-strip[data-work-id='26098718']"); container.find('.work-percentile').text(percentileText.charAt(0).toUpperCase() + percentileText.slice(1)); container.find('.percentile-widget').show(); container.find('.percentile-widget').removeClass('hidden'); }); });</script></span></div><div id="work-strip-premium-row-container"></div></div></div><script>
require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/work_edit-ad038b8c047c1a8d4fa01b402d530ff93c45fee2137a149a4a5398bc8ad67560.js"], function() { /* from javascript_helper.rb */ var dispatcherData = {}; if (false){ window.WowProfile.dispatcher = window.WowProfile.dispatcher || _.clone(Backbone.Events); dispatcherData = { dispatcher: window.WowProfile.dispatcher, downloadLinkId: "-1" } } /* Build the WorkStripView for this strip once; the 'initialized' data flag prevents a second construction. */ $('.js-work-strip[data-work-id=26098718]').each(function() { if (!$(this).data('initialized')) { new WowProfile.WorkStripView({ el: this, workJSON: {"id":26098718,"title":"Temporal Debugging: Automating Time Travel Debugging with URDB","translated_title":"","metadata":{"abstract":"... Xin Dong Northeastern University xindong@ccs.neu.edu Kapil Arya Tyler Denniston Praveen S. Solanki Gene Cooperman Northeastern University {kapil,tyler,psolanki,gene}@ccs.neu.edu Abstract This work addresses two classical problems in debugging. ...","publication_name":"ccs.neu.edu"},"translated_abstract":"... Xin Dong Northeastern University xindong@ccs.neu.edu Kapil Arya Tyler Denniston Praveen S. Solanki Gene Cooperman Northeastern University {kapil,tyler,psolanki,gene}@ccs.neu.edu Abstract This work addresses two classical problems in debugging. ...","internal_url":"https://www.academia.edu/26098718/Temporal_Debugging_Automating_Time_Travel_Debugging_with_URDB","translated_internal_url":"","created_at":"2016-06-13T17:28:51.638-07:00","preview_url":null,"current_user_can_edit":null,"current_user_is_owner":null,"owner_id":50013738,"coauthors_can_edit":true,"document_type":"paper","co_author_tags":[{"id":21246036,"work_id":26098718,"tagging_user_id":50013738,"tagged_user_id":46423,"co_author_invite_id":null,"email":"k***7@gmail.com","display_order":0,"name":"Kapil Arya","title":"Temporal Debugging: Automating Time Travel Debugging with URDB"},{"id":21246042,"work_id":26098718,"tagging_user_id":50013738,"tagged_user_id":null,"co_author_invite_id":4750018,"email":"t***r@csail.mit.edu","display_order":4194304,"name":"Tyler Denniston","title":"Temporal Debugging: Automating Time Travel Debugging with URDB"},{"id":21246044,"work_id":26098718,"tagging_user_id":50013738,"tagged_user_id":null,"co_author_invite_id":4762587,"email":"a***x@locomizer.com","display_order":6291456,"name":"A. Polyakov","title":"Temporal Debugging: Automating Time Travel Debugging with URDB"}],"downloadable_attachments":[],"slug":"Temporal_Debugging_Automating_Time_Travel_Debugging_with_URDB","translated_slug":"","page_count":null,"language":"en","content_type":"Work","summary":"... Xin Dong Northeastern University xindong@ccs.neu.edu Kapil Arya Tyler Denniston Praveen S. Solanki Gene Cooperman Northeastern University {kapil,tyler,psolanki,gene}@ccs.neu.edu Abstract This work addresses two classical problems in debugging. ...","owner":{"id":50013738,"first_name":"Ana-Maria","middle_initials":null,"last_name":"Visan","page_name":"AnaMariaVisan3","domain_name":"independent","created_at":"2016-06-13T17:27:43.265-07:00","display_name":"Ana-Maria Visan","url":"https://independent.academia.edu/AnaMariaVisan3"},"attachments":[],"research_interests":[],"urls":[]}, dispatcherData: dispatcherData }); $(this).data('initialized', true); } }); $a.trackClickSource(".js-work-strip-work-link", "profile_work_strip"); if (false) { Aedu.setUpFigureCarousel('profile-work-26098718-figures'); } }); </script>
</div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js","https://a.academia-assets.com/assets/google_contacts-0dfb882d836b94dbcb4a2d123d6933fc9533eda5be911641f20b4eb428429600.js"], function() { /* from javascript_helper.rb */ /* Click handlers for the profile prompt panels: each records a clickthrough or dismissal before acting. */ $('.js-google-connect-button').click(function(e) { e.preventDefault(); GoogleContacts.authorize_and_show_contacts(); Aedu.Dismissibles.recordClickthrough("WowProfileImportContactsPrompt"); }); $('.js-update-biography-button').click(function(e) { e.preventDefault(); Aedu.Dismissibles.recordClickthrough("UpdateUserBiographyPrompt"); $.ajax({ url: $r.api_v0_profiles_update_about_path({ subdomain_param: 'api', about: "", }), type: 'PUT', success: function(response) { location.reload(); } }); }); $('.js-work-creator-button').click(function (e) { e.preventDefault(); window.location = $r.upload_funnel_document_path({ source: encodeURIComponent(""), }); }); $('.js-video-upload-button').click(function (e) { e.preventDefault(); window.location = $r.upload_funnel_video_path({ source: encodeURIComponent(""), }); }); $('.js-do-this-later-button').click(function() { $(this).closest('.js-profile-nag-panel').remove(); Aedu.Dismissibles.recordDismissal("WowProfileImportContactsPrompt"); }); $('.js-update-biography-do-this-later-button').click(function(){
$(this).closest('.js-profile-nag-panel').remove(); Aedu.Dismissibles.recordDismissal("UpdateUserBiographyPrompt"); }); $('.wow-profile-mentions-upsell--close').click(function(){ $('.wow-profile-mentions-upsell--panel').hide(); Aedu.Dismissibles.recordDismissal("WowProfileMentionsUpsell"); }); $('.wow-profile-mentions-upsell--button').click(function(){ Aedu.Dismissibles.recordClickthrough("WowProfileMentionsUpsell"); }); new WowProfile.SocialRedesignUserWorks({ initialWorksOffset: 20, allWorksOffset: 20, maxSections: 1 }) }); </script>
</div></div></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/wow_profile_edit-5ea339ee107c863779f560dd7275595239fed73f1a13d279d2b599a28c0ecd33.js","https://a.academia-assets.com/assets/add_coauthor-22174b608f9cb871d03443cafa7feac496fb50d7df2d66a53f5ee3c04ba67f53.js","https://a.academia-assets.com/assets/tab-dcac0130902f0cc2d8cb403714dd47454f11fc6fb0e99ae6a0827b06613abc20.js","https://a.academia-assets.com/assets/wow_profile-a9bf3a2bc8c89fa2a77156577594264ee8a0f214d74241bc0fcd3f69f8d107ac.js"], function() { /* from javascript_helper.rb */ window.ae = window.ae || {}; window.ae.WowProfile = window.ae.WowProfile || {}; /* The edit views are created only when the logged-in user is the profile being viewed. */ if(Aedu.User.current && Aedu.User.current.id === $viewedUser.id) { window.ae.WowProfile.current_user_edit = {}; new WowProfileEdit.EditUploadView({ el: '.js-edit-upload-button-wrapper', model: window.$current_user, }); new AddCoauthor.AddCoauthorsController(); } var userInfoView = new WowProfile.SocialRedesignUserInfo({ recaptcha_key: "6LdxlRMTAAAAADnu_zyLhLg0YF9uACwz78shpjJB" }); WowProfile.router = new WowProfile.Router({ userInfoView: userInfoView }); Backbone.history.start({ pushState: true, root: "/" + $viewedUser.page_name }); new WowProfile.UserWorksNav() }); </script>
</div> <div class="bootstrap login"><div class="modal fade login-modal" id="login-modal"><div class="login-modal-dialog modal-dialog"><div class="modal-content"><div class="modal-header"><button class="close 
close" data-dismiss="modal" type="button"><span aria-hidden="true">&times;</span><span class="sr-only">Close</span></button><h4 class="modal-title text-center"><strong>Log In</strong></h4></div><div class="modal-body"><div class="row"><div class="col-xs-10 col-xs-offset-1"><button class="btn btn-fb btn-lg btn-block btn-v-center-content" id="login-facebook-oauth-button"><svg style="float: left; width: 19px; line-height: 1em; margin-right: .3em;" aria-hidden="true" focusable="false" data-prefix="fab" data-icon="facebook-square" class="svg-inline--fa fa-facebook-square fa-w-14" role="img" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512"><path fill="currentColor" d="M400 32H48A48 48 0 0 0 0 80v352a48 48 0 0 0 48 48h137.25V327.69h-63V256h63v-54.64c0-62.15 37-96.48 93.67-96.48 27.14 0 55.52 4.84 55.52 4.84v61h-31.27c-30.81 0-40.42 19.12-40.42 38.73V256h68.78l-11 71.69h-57.78V480H400a48 48 0 0 0 48-48V80a48 48 0 0 0-48-48z"></path></svg><small><strong>Log in</strong> with <strong>Facebook</strong></small></button><br /><button class="btn btn-google btn-lg btn-block btn-v-center-content" id="login-google-oauth-button"><svg style="float: left; width: 22px; line-height: 1em; margin-right: .3em;" aria-hidden="true" focusable="false" data-prefix="fab" data-icon="google-plus" class="svg-inline--fa fa-google-plus fa-w-16" role="img" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><path fill="currentColor" d="M256,8C119.1,8,8,119.1,8,256S119.1,504,256,504,504,392.9,504,256,392.9,8,256,8ZM185.3,380a124,124,0,0,1,0-248c31.3,0,60.1,11,83,32.3l-33.6,32.6c-13.2-12.9-31.3-19.1-49.4-19.1-42.9,0-77.2,35.5-77.2,78.1S142.3,334,185.3,334c32.6,0,64.9-19.1,70.1-53.3H185.3V238.1H302.2a109.2,109.2,0,0,1,1.9,20.7c0,70.8-47.5,121.2-118.8,121.2ZM415.5,273.8v35.5H380V273.8H344.5V238.3H380V202.8h35.5v35.5h35.2v35.5Z"></path></svg><small><strong>Log in</strong> with <strong>Google</strong></small></button><br /><style type="text/css">.sign-in-with-apple-button { width: 100%; height: 
52px; border-radius: 3px; border: 1px solid black; cursor: pointer; } .sign-in-with-apple-button > div { margin: 0 auto; /* This centers the Apple-rendered button horizontally */ }</style><script src="https://appleid.cdn-apple.com/appleauth/static/jsapi/appleid/1/en_US/appleid.auth.js" type="text/javascript"></script><div class="sign-in-with-apple-button" data-border="false" data-color="white" id="appleid-signin"><span &nbsp;&nbsp;="Sign Up with Apple" class="u-fs11"></span></div><script>AppleID.auth.init({ clientId: 'edu.academia.applesignon', scope: 'name email', redirectURI: 'https://www.academia.edu/sessions', state: "34dace5db43c4c4e155d89323438faa811527dcd961e0f48dc6c702f61223474", });</script><script>/* Hacky way of checking if on fast loswp */ if (window.loswp == null) { /* Wire up the OAuth login buttons only when their constructors are present on window.Aedu. */ (function() { const Google = window?.Aedu?.Auth?.OauthButton?.Login?.Google; const Facebook = window?.Aedu?.Auth?.OauthButton?.Login?.Facebook; if (Google) { new Google({ el: '#login-google-oauth-button', rememberMeCheckboxId: 'remember_me', track: null }); } if (Facebook) { new Facebook({ el: '#login-facebook-oauth-button', rememberMeCheckboxId: 'remember_me', track: null }); } })(); }</script></div></div></div><div class="modal-body"><div class="row"><div class="col-xs-10 col-xs-offset-1"><div class="hr-heading login-hr-heading"><span class="hr-heading-text">or</span></div></div></div></div><div class="modal-body"><div class="row"><div class="col-xs-10 col-xs-offset-1"><form class="js-login-form" action="https://www.academia.edu/sessions" accept-charset="UTF-8" method="post"><input type="hidden" name="authenticity_token" value="ES_iT36y-9EHgFeXSFmCnNLf5K783kqCvO_4CwdMdbdJ1-Xr4YOOZfakBaXe7ilQEnf3k1B-VG2u3XOKlc9HHw" autocomplete="off" /><div class="form-group"><label class="control-label" for="login-modal-email-input" style="font-size: 14px;">Email</label><input class="form-control" id="login-modal-email-input" name="login" type="email" autocomplete="username" /></div><div class="form-group"><label class="control-label" 
for="login-modal-password-input" style="font-size: 14px;">Password</label><input class="form-control" id="login-modal-password-input" name="password" type="password" autocomplete="current-password" /></div><input type="hidden" name="post_login_redirect_url" id="post_login_redirect_url" value="https://independent.academia.edu/KapilArya2" autocomplete="off" /><div class="checkbox"><label><input type="checkbox" name="remember_me" id="remember_me" value="1" checked="checked" /><small style="font-size: 12px; margin-top: 2px; display: inline-block;">Remember me on this computer</small></label></div><br><input type="submit" name="commit" value="Log In" class="btn btn-primary btn-block btn-lg js-login-submit" data-disable-with="Log In" /><!-- Line break after the submit button, written as a valid void element; an end-tag form of br is a parse error that parsers treat as a br start tag. --><br></form><script>typeof window?.Aedu?.recaptchaManagedForm === 'function' && window.Aedu.recaptchaManagedForm( document.querySelector('.js-login-form'), document.querySelector('.js-login-submit') );</script><small style="font-size: 12px;"><br />or <a data-target="#login-modal-reset-password-container" data-toggle="collapse" href="javascript:void(0)">reset password</a></small><div class="collapse" id="login-modal-reset-password-container"><br /><div class="well margin-0x"><form class="js-password-reset-form" action="https://www.academia.edu/reset_password" accept-charset="UTF-8" method="post"><input type="hidden" name="authenticity_token" value="ixD27z9tu1ax8JnQR1r2sd1mtSJAIrDdfMLoIPtvT27T6PFLoFzO4kDUy-LR7V19Hc6mH-yCrjJu8GOhaex9xg" autocomplete="off" /><p>Enter the email address you signed up with and we&#39;ll email you a reset link.</p><div class="form-group"><input class="form-control" name="email" type="email" autocomplete="email" aria-label="Email address" /></div><script src="https://recaptcha.net/recaptcha/api.js" async defer></script> <script> /* reCAPTCHA callback: walks up from the first .g-recaptcha element to its enclosing form and submits it. */ var invisibleRecaptchaSubmit = function () { var closestForm = function (ele) { var curEle = ele.parentNode; while (curEle.nodeName !== 'FORM' && curEle.nodeName !== 'BODY'){ curEle = curEle.parentNode; } return curEle.nodeName === 'FORM' ? 
curEle : null }; var eles = document.getElementsByClassName('g-recaptcha'); if (eles.length > 0) { var form = closestForm(eles[0]); if (form) { form.submit(); } } }; </script> <input type="submit" data-sitekey="6Lf3KHUUAAAAACggoMpmGJdQDtiyrjVlvGJ6BbAj" data-callback="invisibleRecaptchaSubmit" class="g-recaptcha btn btn-primary btn-block" value="Email me a link"/> </form></div></div><script> require.config({ waitSeconds: 90 })(["https://a.academia-assets.com/assets/collapse-45805421cf446ca5adf7aaa1935b08a3a8d1d9a6cc5d91a62a2a3a00b20b3e6a.js"], function() { /* from javascript_helper.rb */ /* Focus the reset-password email field once the collapse panel has been shown. */ $("#login-modal-reset-password-container").on("shown.bs.collapse", function() { $(this).find("input[type=email]").focus(); }); }); </script> </div></div></div><div class="modal-footer"><div class="text-center"><small style="font-size: 12px;">Need an account?&nbsp;<a rel="nofollow" href="https://www.academia.edu/signup">Click here to sign up</a></small></div></div></div></div></div></div><script>/* If we are on subdomain or non-bootstrapped page, redirect to login page instead of showing modal */ (function(){ if (typeof $ === 'undefined') return; var host = window.location.hostname; if ((host === $domain || host === "www."+$domain) && (typeof $().modal === 'function')) { $("#nav_log_in").click(function(e) { /* Don't follow the link and open the modal */ e.preventDefault(); $("#login-modal").on('shown.bs.modal', function() { $(this).find("#login-modal-email-input").focus() }).modal('show'); }); } })()</script> <div class="bootstrap" id="footer"><div class="footer-content clearfix text-center padding-top-7x" style="width:100%;"><ul class="footer-links-secondary footer-links-wide list-inline margin-bottom-1x"><li><a href="https://www.academia.edu/about">About</a></li><li><a href="https://www.academia.edu/press">Press</a></li><li><a href="https://www.academia.edu/documents">Papers</a></li><li><a href="https://www.academia.edu/topics">Topics</a></li><li><a 
href="https://www.academia.edu/journals">Academia.edu Journals</a></li><li><a rel="nofollow" href="https://www.academia.edu/hiring"><svg style="width: 13px; height: 13px;" aria-hidden="true" focusable="false" data-prefix="fas" data-icon="briefcase" class="svg-inline--fa fa-briefcase fa-w-16" role="img" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><path fill="currentColor" d="M320 336c0 8.84-7.16 16-16 16h-96c-8.84 0-16-7.16-16-16v-48H0v144c0 25.6 22.4 48 48 48h416c25.6 0 48-22.4 48-48V288H320v48zm144-208h-80V80c0-25.6-22.4-48-48-48H176c-25.6 0-48 22.4-48 48v48H48c-25.6 0-48 22.4-48 48v80h512v-80c0-25.6-22.4-48-48-48zm-144 0H192V96h128v32z"></path></svg>&nbsp;<strong>We're Hiring!</strong></a></li><li><a rel="nofollow" href="https://support.academia.edu/hc/en-us"><svg style="width: 12px; height: 12px;" aria-hidden="true" focusable="false" data-prefix="fas" data-icon="question-circle" class="svg-inline--fa fa-question-circle fa-w-16" role="img" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512"><path fill="currentColor" d="M504 256c0 136.997-111.043 248-248 248S8 392.997 8 256C8 119.083 119.043 8 256 8s248 111.083 248 248zM262.655 90c-54.497 0-89.255 22.957-116.549 63.758-3.536 5.286-2.353 12.415 2.715 16.258l34.699 26.31c5.205 3.947 12.621 3.008 16.665-2.122 17.864-22.658 30.113-35.797 57.303-35.797 20.429 0 45.698 13.148 45.698 32.958 0 14.976-12.363 22.667-32.534 33.976C247.128 238.528 216 254.941 216 296v4c0 6.627 5.373 12 12 12h56c6.627 0 12-5.373 12-12v-1.333c0-28.462 83.186-29.647 83.186-106.667 0-58.002-60.165-102-116.531-102zM256 338c-25.365 0-46 20.635-46 46 0 25.364 20.635 46 46 46s46-20.636 46-46c0-25.365-20.635-46-46-46z"></path></svg>&nbsp;<strong>Help Center</strong></a></li></ul><ul class="footer-links-tertiary list-inline margin-bottom-1x"><li class="small">Find new research papers in:</li><li class="small"><a href="https://www.academia.edu/Documents/in/Physics">Physics</a></li><li class="small"><a 
href="https://www.academia.edu/Documents/in/Chemistry">Chemistry</a></li><li class="small"><a href="https://www.academia.edu/Documents/in/Biology">Biology</a></li><li class="small"><a href="https://www.academia.edu/Documents/in/Health_Sciences">Health Sciences</a></li><li class="small"><a href="https://www.academia.edu/Documents/in/Ecology">Ecology</a></li><li class="small"><a href="https://www.academia.edu/Documents/in/Earth_Sciences">Earth Sciences</a></li><li class="small"><a href="https://www.academia.edu/Documents/in/Cognitive_Science">Cognitive Science</a></li><li class="small"><a href="https://www.academia.edu/Documents/in/Mathematics">Mathematics</a></li><li class="small"><a href="https://www.academia.edu/Documents/in/Computer_Science">Computer Science</a></li></ul></div></div><div class="DesignSystem" id="credit" style="width:100%;"><ul class="u-pl0x footer-links-legal list-inline"><li><a rel="nofollow" href="https://www.academia.edu/terms">Terms</a></li><li><a rel="nofollow" href="https://www.academia.edu/privacy">Privacy</a></li><li><a rel="nofollow" href="https://www.academia.edu/copyright">Copyright</a></li><li>Academia &copy;2025</li></ul></div><script> /*<![CDATA[*/ /* Sets the detect_gmtoffset flag, then calls Academia.set_gmtoffset('/gmtoffset') only if that function exists. */ window.detect_gmtoffset = true; window.Academia && window.Academia.set_gmtoffset && Academia.set_gmtoffset('/gmtoffset'); /*]]>*/ </script> <div id='overlay_background'></div> <div id='bootstrap-modal-container' class='bootstrap'></div> <div id='ds-modal-container' class='bootstrap DesignSystem'></div> <div id='full-screen-modal'></div> </div> </body> </html>

Pages: 1 2 3 4 5 6 7 8 9 10