CINXE.COM
Floating-point arithmetic - Wikipedia
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>Floating-point arithmetic - Wikipedia</title>
<script>
// Replace the root element's class list with its "client-js" variant. For each
// entry in the enwikimwclientpreferences cookie (entries are separated by an
// encoded comma, %2C), the matching "<feature>-clientpref-<value>" class is
// swapped for the stored entry.
(function () {
  var className = "client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available";
  var cookie = document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);
  if (cookie) {
    cookie[1].split('%2C').forEach(function (pref) {
      // The inner replace() strips the trailing "-clientpref-<value>" and every
      // character other than word characters and hyphens, leaving only the
      // feature name to embed in the pattern.
      className = className.replace(
        new RegExp('(^| )' + pref.replace(/-clientpref-\w+$|[^\w-]+/g, '') + '-clientpref-\\w+( |$)'),
        '$1' + pref + '$2'
      );
    });
  }
  document.documentElement.className = className;
}());

// Page configuration globals. Lines are broken only after commas between
// properties: a raw line break inside a quoted string is a syntax error that
// would stop this whole script from parsing.
RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy",
"wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],
"wgRequestId":"d4993929-d719-4458-8550-b429eec3d7c8","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,
"wgPageName":"Floating-point_arithmetic","wgTitle":"Floating-point arithmetic",
"wgCurRevisionId":1280068135,"wgRevisionId":1280068135,"wgArticleId":11376,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],
"wgCategories":["CS1 German-language sources (de)","Webarchive template wayback links","Articles with short description","Short description is different from Wikidata","Use dmy dates from May 2019","All articles with unsourced statements","Articles with unsourced statements from July 2020","Wikipedia articles needing clarification from November 2024","Articles with unsourced statements from June 2016","Articles with example C code","Floating point","Computer arithmetic"],
"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Floating-point_arithmetic","wgRelevantArticleId":11376,
"wgIsProbablyEditable":true,"wgRelevantPageIsProbablyEditable":true,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgRedirectedFrom":"Floating_point","wgNoticeProject":"wikipedia",
"wgCiteReferencePreviewsActive":false,"wgFlaggedRevsParams":{"tags":{"status":{"levels":1}}},"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgPopupsFlags":0,
"wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","pageVariantFallbacks":"en"},
"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":false,"nearby":true},
"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":100000,"wgInternalRedirectTargetUrl":"/wiki/Floating-point_arithmetic","wgEditSubmitButtonLabelPublish":true,
"wgULSPosition":"interlanguage","wgULSisCompactLinksEnabled":false,"wgVector2022LanguageInHeader":true,"wgULSisLanguageSelectorEmpty":false,"wgWikibaseItemId":"Q117879",
"wgCheckUserClientHintsHeadersJsApi":["brands","architecture","bitness","fullVersionList","mobile","model","platform","platformVersion"],
"GEHomepageSuggestedEditsEnableTopics":true,"wgGETopicsMatchModeEnabled":false,"wgGEStructuredTaskRejectionReasonTextInputEnabled":false,"wgGELevelingUpEnabledForUser":false};
RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles":"ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading",
"ext.cite.styles":"ready","ext.math.styles":"ready","ext.pygments":"ready","skins.vector.search.codex.styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready",
"jquery.makeCollapsible.styles":"ready","ext.wikimediamessages.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.interlanguage":"ready","wikibase.client.init":"ready"};
RLPAGEMODULES=["mediawiki.action.view.redirect","ext.cite.ux-enhancements","ext.pygments.view","mediawiki.page.media","ext.scribunto.logs","site","mediawiki.page.ready",
"jquery.makeCollapsible","mediawiki.toc","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.ReferenceTooltips","ext.gadget.switcher",
"ext.urlShortener.toolbar","ext.centralauth.centralautologin","mmv.bootstrap","ext.popups","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader",
"ext.echo.centralauth","ext.eventLogging","ext.wikimediaEvents","ext.navigationTiming","ext.uls.interface","ext.cx.eventlogging.campaigns","ext.cx.uls.quick.actions",
"wikibase.client.vector-2022","ext.checkUser.clientHints","ext.growthExperiments.SuggestedEditSession"];
</script>
<script>
// Queue a callback on RLQ (created here if it does not exist yet) that passes
// the "user.options@12s5i" module to mw.loader.impl; the module body sets the
// patrol, watch and CSRF tokens.
(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});
</script>
<link rel="stylesheet" href="/w/load.php?lang=en&modules=ext.cite.styles%7Cext.math.styles%7Cext.pygments%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediamessages.styles%7Cjquery.makeCollapsible.styles%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles%7Cwikibase.client.init&only=styles&skin=vector-2022">
<script async=""
src="/w/load.php?lang=en&modules=startup&only=scripts&raw=1&skin=vector-2022"></script> <meta name="ResourceLoaderDynamicStyles" content=""> <link rel="stylesheet" href="/w/load.php?lang=en&modules=site.styles&only=styles&skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.20"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta property="og:image" content="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Z3_Deutsches_Museum.JPG/1200px-Z3_Deutsches_Museum.JPG"> <meta property="og:image:width" content="1200"> <meta property="og:image:height" content="900"> <meta property="og:image" content="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Z3_Deutsches_Museum.JPG/800px-Z3_Deutsches_Museum.JPG"> <meta property="og:image:width" content="800"> <meta property="og:image:height" content="600"> <meta property="og:image" content="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Z3_Deutsches_Museum.JPG/640px-Z3_Deutsches_Museum.JPG"> <meta property="og:image:width" content="640"> <meta property="og:image:height" content="480"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="Floating-point arithmetic - Wikipedia"> <meta property="og:type" content="website"> <link rel="preconnect" href="//upload.wikimedia.org"> <link rel="alternate" media="only screen and (max-width: 640px)" href="//en.m.wikipedia.org/wiki/Floating-point_arithmetic"> <link rel="alternate" type="application/x-wiki" title="Edit this page" href="/w/index.php?title=Floating-point_arithmetic&action=edit"> <link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png"> <link rel="icon" href="/static/favicon/wikipedia.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikipedia (en)"> <link rel="EditURI" 
type="application/rsd+xml" href="//en.wikipedia.org/w/api.php?action=rsd">
<link rel="canonical" href="https://en.wikipedia.org/wiki/Floating-point_arithmetic">
<link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en">
<link rel="alternate" type="application/atom+xml" title="Wikipedia Atom feed" href="/w/index.php?title=Special:RecentChanges&feed=atom">
<!-- DNS prefetch hints must name a host: without the leading "//" the value is resolved as a relative path on the current site. -->
<link rel="dns-prefetch" href="//meta.wikimedia.org">
<link rel="dns-prefetch" href="//login.wikimedia.org">
</head>
<body class="skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject mw-editable page-Floating-point_arithmetic rootpage-Floating-point_arithmetic skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a>
<div class="vector-header-container">
  <header class="vector-header mw-header">
    <div class="vector-header-start">
      <nav class="vector-main-menu-landmark" aria-label="Site">
        <!-- Main menu dropdown: a checkbox paired with the label that follows it. -->
        <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" title="Main menu" >
          <input type="checkbox" id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" >
          <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span>
            <span class="vector-dropdown-label-text">Main menu</span>
          </label>
          <div class="vector-dropdown-content">
            <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container">
              <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element">
                <div class="vector-pinnable-header vector-main-menu-pinnable-header
vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-contents" class="mw-list-item"><a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a></li><li id="n-currentevents" class="mw-list-item"><a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a></li><li id="n-randompage" class="mw-list-item"><a href="/wiki/Special:Random" title="Visit a randomly selected article [x]" accesskey="x"><span>Random article</span></a></li><li id="n-aboutsite" class="mw-list-item"><a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a></li><li id="n-contactpage" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a></li> </ul> </div> </div> <div id="p-interaction" class="vector-menu mw-portlet mw-portlet-interaction" > <div class="vector-menu-heading"> Contribute </div> <div class="vector-menu-content"> <ul 
class="vector-menu-content-list"> <li id="n-help" class="mw-list-item"><a href="/wiki/Help:Contents" title="Guidance on how to use and edit Wikipedia"><span>Help</span></a></li><li id="n-introduction" class="mw-list-item"><a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a></li><li id="n-portal" class="mw-list-item"><a href="/wiki/Wikipedia:Community_portal" title="The hub for editors"><span>Community portal</span></a></li><li id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent changes to Wikipedia [r]" accesskey="r"><span>Recent changes</span></a></li><li id="n-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_upload_wizard" title="Add images or other media for use on Wikipedia"><span>Upload file</span></a></li><li id="n-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages"><span>Special pages</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikipedia.png" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikipedia" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"> <img class="mw-logo-tagline" alt="The Free Encyclopedia" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" width="117" height="13" style="width: 7.3125em; height: 0.8125em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue vector-search-box-collapses vector-search-box-show-thumbnail vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikipedia [f]" accesskey="f"><span 
class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikipedia" aria-label="Search Wikipedia" autocapitalize="sentences" title="Search Wikipedia [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page's font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button 
cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-createaccount-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:CreateAccount&returnto=Floating-point+arithmetic" title="You are encouraged to create an account and log in; however, it is not mandatory" class=""><span>Create account</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&returnto=Floating-point+arithmetic" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o" class=""><span>Log in</span></a> </li> </ul> </div> </div> </div> <div id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out" title="Log in and more options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a href="https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en"><span>Donate</span></a></li><li id="pt-createaccount" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:CreateAccount&returnto=Floating-point+arithmetic" title="You are encouraged to create an account and log in; however, it is not mandatory"><span class="vector-icon mw-ui-icon-userAdd mw-ui-icon-wikimedia-userAdd"></span> <span>Create account</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&returnto=Floating-point+arithmetic" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" 
data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-Overview" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Overview"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>Overview</span> </div> </a> <button aria-controls="toc-Overview-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Overview subsection</span> </button> <ul id="toc-Overview-sublist" class="vector-toc-list"> <li id="toc-Floating-point_numbers" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Floating-point_numbers"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.1</span> <span>Floating-point numbers</span> </div> </a> <ul id="toc-Floating-point_numbers-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Alternatives_to_floating-point_numbers" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Alternatives_to_floating-point_numbers"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.2</span> <span>Alternatives to floating-point numbers</span> </div> </a> <ul id="toc-Alternatives_to_floating-point_numbers-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-History" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#History"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> 
<span>History</span> </div> </a> <ul id="toc-History-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Range_of_floating-point_numbers" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Range_of_floating-point_numbers"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Range of floating-point numbers</span> </div> </a> <ul id="toc-Range_of_floating-point_numbers-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-IEEE_754:_floating_point_in_modern_computers" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#IEEE_754:_floating_point_in_modern_computers"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>IEEE 754: floating point in modern computers</span> </div> </a> <button aria-controls="toc-IEEE_754:_floating_point_in_modern_computers-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle IEEE 754: floating point in modern computers subsection</span> </button> <ul id="toc-IEEE_754:_floating_point_in_modern_computers-sublist" class="vector-toc-list"> <li id="toc-Internal_representation" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Internal_representation"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.1</span> <span>Internal representation</span> </div> </a> <ul id="toc-Internal_representation-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Other_notable_floating-point_formats" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Other_notable_floating-point_formats"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Other notable floating-point formats</span> </div> </a> <ul 
id="toc-Other_notable_floating-point_formats-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Representable_numbers,_conversion_and_rounding" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Representable_numbers,_conversion_and_rounding"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>Representable numbers, conversion and rounding</span> </div> </a> <button aria-controls="toc-Representable_numbers,_conversion_and_rounding-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Representable numbers, conversion and rounding subsection</span> </button> <ul id="toc-Representable_numbers,_conversion_and_rounding-sublist" class="vector-toc-list"> <li id="toc-Rounding_modes" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Rounding_modes"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.1</span> <span>Rounding modes</span> </div> </a> <ul id="toc-Rounding_modes-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Binary-to-decimal_conversion_with_minimal_number_of_digits" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Binary-to-decimal_conversion_with_minimal_number_of_digits"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.2</span> <span>Binary-to-decimal conversion with minimal number of digits</span> </div> </a> <ul id="toc-Binary-to-decimal_conversion_with_minimal_number_of_digits-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Decimal-to-binary_conversion" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Decimal-to-binary_conversion"> <div class="vector-toc-text"> <span class="vector-toc-numb">6.3</span> <span>Decimal-to-binary conversion</span> </div> </a> <ul id="toc-Decimal-to-binary_conversion-sublist" 
class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Floating-point_operations" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Floating-point_operations"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>Floating-point operations</span> </div> </a> <button aria-controls="toc-Floating-point_operations-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Floating-point operations subsection</span> </button> <ul id="toc-Floating-point_operations-sublist" class="vector-toc-list"> <li id="toc-Addition_and_subtraction" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Addition_and_subtraction"> <div class="vector-toc-text"> <span class="vector-toc-numb">7.1</span> <span>Addition and subtraction</span> </div> </a> <ul id="toc-Addition_and_subtraction-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Multiplication_and_division" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Multiplication_and_division"> <div class="vector-toc-text"> <span class="vector-toc-numb">7.2</span> <span>Multiplication and division</span> </div> </a> <ul id="toc-Multiplication_and_division-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Literal_syntax" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Literal_syntax"> <div class="vector-toc-text"> <span class="vector-toc-numb">7.3</span> <span>Literal syntax</span> </div> </a> <ul id="toc-Literal_syntax-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Dealing_with_exceptional_cases" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Dealing_with_exceptional_cases"> <div class="vector-toc-text"> <span class="vector-toc-numb">8</span> 
<span>Dealing with exceptional cases</span>
</div>
</a>
<ul id="toc-Dealing_with_exceptional_cases-sublist" class="vector-toc-list"> </ul>
</li>
<li id="toc-Accuracy_problems" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded">
  <a class="vector-toc-link" href="#Accuracy_problems">
    <div class="vector-toc-text">
      <span class="vector-toc-numb">9</span>
      <span>Accuracy problems</span>
    </div>
  </a>
  <button aria-controls="toc-Accuracy_problems-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle">
    <span class="vector-icon mw-ui-icon-wikimedia-expand"></span>
    <span>Toggle Accuracy problems subsection</span>
  </button>
  <ul id="toc-Accuracy_problems-sublist" class="vector-toc-list">
    <li id="toc-Incidents" class="vector-toc-list-item vector-toc-level-2">
      <a class="vector-toc-link" href="#Incidents">
        <div class="vector-toc-text">
          <span class="vector-toc-numb">9.1</span>
          <span>Incidents</span>
        </div>
      </a>
      <ul id="toc-Incidents-sublist" class="vector-toc-list"> </ul>
    </li>
    <li id="toc-Machine_precision_and_backward_error_analysis" class="vector-toc-list-item vector-toc-level-2">
      <a class="vector-toc-link" href="#Machine_precision_and_backward_error_analysis">
        <div class="vector-toc-text">
          <span class="vector-toc-numb">9.2</span>
          <span>Machine precision and backward error analysis</span>
        </div>
      </a>
      <ul id="toc-Machine_precision_and_backward_error_analysis-sublist" class="vector-toc-list"> </ul>
    </li>
    <li id="toc-Minimizing_the_effect_of_accuracy_problems" class="vector-toc-list-item vector-toc-level-2">
      <a class="vector-toc-link" href="#Minimizing_the_effect_of_accuracy_problems">
        <div class="vector-toc-text">
          <span class="vector-toc-numb">9.3</span>
          <span>Minimizing the effect of accuracy problems</span>
        </div>
      </a>
      <ul id="toc-Minimizing_the_effect_of_accuracy_problems-sublist" class="vector-toc-list"> </ul>
    </li>
    <!-- The quotation marks in this entry's id and href values are written as &quot;: a literal double quote would end the attribute value early. -->
    <li id="toc-&quot;Fast_math&quot;_optimization" class="vector-toc-list-item vector-toc-level-2">
      <a class="vector-toc-link" href="#&quot;Fast_math&quot;_optimization">
        <div class="vector-toc-text">
          <span class="vector-toc-numb">9.4</span>
          <span>"Fast math" optimization</span>
        </div>
      </a>
      <ul id="toc-&quot;Fast_math&quot;_optimization-sublist" class="vector-toc-list"> </ul>
    </li>
  </ul>
</li>
<li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded">
  <a class="vector-toc-link" href="#See_also">
    <div class="vector-toc-text">
      <span class="vector-toc-numb">10</span>
      <span>See also</span>
    </div>
  </a>
  <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul>
</li>
<li id="toc-Notes" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded">
  <a class="vector-toc-link" href="#Notes">
    <div class="vector-toc-text">
      <span class="vector-toc-numb">11</span>
      <span>Notes</span>
    </div>
  </a>
  <ul id="toc-Notes-sublist" class="vector-toc-list"> </ul>
</li>
<li id="toc-References" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded">
  <a class="vector-toc-link" href="#References">
    <div class="vector-toc-text">
      <span class="vector-toc-numb">12</span>
      <span>References</span>
    </div>
  </a>
  <ul id="toc-References-sublist" class="vector-toc-list"> </ul>
</li>
<li id="toc-Further_reading" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded">
  <a class="vector-toc-link" href="#Further_reading">
    <div class="vector-toc-text">
      <span class="vector-toc-numb">13</span>
      <span>Further reading</span>
    </div>
  </a>
  <ul id="toc-Further_reading-sublist" class="vector-toc-list"> </ul>
</li>
<li id="toc-External_links" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded">
  <a class="vector-toc-link" href="#External_links">
    <div class="vector-toc-text">
      <span class="vector-toc-numb">14</span>
      <span>External links</span>
    </div>
  </a>
  <ul id="toc-External_links-sublist" class="vector-toc-list"> </ul>
</li>
</ul>
</div>
</div>
</nav>
</div>
</div>
<div class="mw-content-container">
<main
id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" title="Table of Contents" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Floating-point arithmetic</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. 
Available in 44 languages" > <label id="p-lang-btn-label" for="p-lang-btn-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-44" aria-hidden="true" ><span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive"></span> <span class="vector-dropdown-label-text">44 languages</span> </label> <div class="vector-dropdown-content"> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="interlanguage-link interwiki-ar mw-list-item"><a href="https://ar.wikipedia.org/wiki/%D8%AD%D8%B3%D8%A7%D8%A8%D8%A7%D8%AA_%D8%A7%D9%84%D9%81%D8%A7%D8%B5%D9%84%D8%A9_%D8%A7%D9%84%D9%85%D8%AA%D8%AD%D8%B1%D9%83%D8%A9" title="حسابات الفاصلة المتحركة – Arabic" lang="ar" hreflang="ar" data-title="حسابات الفاصلة المتحركة" data-language-autonym="العربية" data-language-local-name="Arabic" class="interlanguage-link-target"><span>العربية</span></a></li><li class="interlanguage-link interwiki-bg mw-list-item"><a href="https://bg.wikipedia.org/wiki/%D0%90%D1%80%D0%B8%D1%82%D0%BC%D0%B5%D1%82%D0%B8%D0%BA%D0%B0_%D1%81_%D0%BF%D0%BB%D0%B0%D0%B2%D0%B0%D1%89%D0%B0_%D0%B7%D0%B0%D0%BF%D0%B5%D1%82%D0%B0%D1%8F" title="Аритметика с плаваща запетая – Bulgarian" lang="bg" hreflang="bg" data-title="Аритметика с плаваща запетая" data-language-autonym="Български" data-language-local-name="Bulgarian" class="interlanguage-link-target"><span>Български</span></a></li><li class="interlanguage-link interwiki-ca mw-list-item"><a href="https://ca.wikipedia.org/wiki/Coma_flotant" title="Coma flotant – Catalan" lang="ca" hreflang="ca" data-title="Coma flotant" data-language-autonym="Català" data-language-local-name="Catalan" class="interlanguage-link-target"><span>Català</span></a></li><li class="interlanguage-link interwiki-cs mw-list-item"><a 
href="https://cs.wikipedia.org/wiki/Pohybliv%C3%A1_%C5%99%C3%A1dov%C3%A1_%C4%8D%C3%A1rka" title="Pohyblivá řádová čárka – Czech" lang="cs" hreflang="cs" data-title="Pohyblivá řádová čárka" data-language-autonym="Čeština" data-language-local-name="Czech" class="interlanguage-link-target"><span>Čeština</span></a></li><li class="interlanguage-link interwiki-da mw-list-item"><a href="https://da.wikipedia.org/wiki/Flydende_tal" title="Flydende tal – Danish" lang="da" hreflang="da" data-title="Flydende tal" data-language-autonym="Dansk" data-language-local-name="Danish" class="interlanguage-link-target"><span>Dansk</span></a></li><li class="interlanguage-link interwiki-de mw-list-item"><a href="https://de.wikipedia.org/wiki/Gleitkommazahl" title="Gleitkommazahl – German" lang="de" hreflang="de" data-title="Gleitkommazahl" data-language-autonym="Deutsch" data-language-local-name="German" class="interlanguage-link-target"><span>Deutsch</span></a></li><li class="interlanguage-link interwiki-et mw-list-item"><a href="https://et.wikipedia.org/wiki/Ujukomaarv" title="Ujukomaarv – Estonian" lang="et" hreflang="et" data-title="Ujukomaarv" data-language-autonym="Eesti" data-language-local-name="Estonian" class="interlanguage-link-target"><span>Eesti</span></a></li><li class="interlanguage-link interwiki-el mw-list-item"><a href="https://el.wikipedia.org/wiki/%CE%A4%CF%8D%CF%80%CE%BF%CF%82_%CE%BA%CE%B9%CE%BD%CE%B7%CF%84%CE%AE%CF%82_%CF%85%CF%80%CE%BF%CE%B4%CE%B9%CE%B1%CF%83%CF%84%CE%BF%CE%BB%CE%AE%CF%82" title="Τύπος κινητής υποδιαστολής – Greek" lang="el" hreflang="el" data-title="Τύπος κινητής υποδιαστολής" data-language-autonym="Ελληνικά" data-language-local-name="Greek" class="interlanguage-link-target"><span>Ελληνικά</span></a></li><li class="interlanguage-link interwiki-es mw-list-item"><a href="https://es.wikipedia.org/wiki/Coma_flotante" title="Coma flotante – Spanish" lang="es" hreflang="es" data-title="Coma flotante" data-language-autonym="Español" 
data-language-local-name="Spanish" class="interlanguage-link-target"><span>Español</span></a></li><li class="interlanguage-link interwiki-eo mw-list-item"><a href="https://eo.wikipedia.org/wiki/Glitkomo" title="Glitkomo – Esperanto" lang="eo" hreflang="eo" data-title="Glitkomo" data-language-autonym="Esperanto" data-language-local-name="Esperanto" class="interlanguage-link-target"><span>Esperanto</span></a></li><li class="interlanguage-link interwiki-fa mw-list-item"><a href="https://fa.wikipedia.org/wiki/%D9%85%D8%AD%D8%A7%D8%B3%D8%A8%D8%A7%D8%AA_%D9%85%D9%85%DB%8C%D8%B2_%D8%B4%D9%86%D8%A7%D9%88%D8%B1" title="محاسبات ممیز شناور – Persian" lang="fa" hreflang="fa" data-title="محاسبات ممیز شناور" data-language-autonym="فارسی" data-language-local-name="Persian" class="interlanguage-link-target"><span>فارسی</span></a></li><li class="interlanguage-link interwiki-fr mw-list-item"><a href="https://fr.wikipedia.org/wiki/Virgule_flottante" title="Virgule flottante – French" lang="fr" hreflang="fr" data-title="Virgule flottante" data-language-autonym="Français" data-language-local-name="French" class="interlanguage-link-target"><span>Français</span></a></li><li class="interlanguage-link interwiki-ga mw-list-item"><a href="https://ga.wikipedia.org/wiki/Uimhir_shn%C3%A1mhphointe" title="Uimhir shnámhphointe – Irish" lang="ga" hreflang="ga" data-title="Uimhir shnámhphointe" data-language-autonym="Gaeilge" data-language-local-name="Irish" class="interlanguage-link-target"><span>Gaeilge</span></a></li><li class="interlanguage-link interwiki-ko mw-list-item"><a href="https://ko.wikipedia.org/wiki/%EB%B6%80%EB%8F%99%EC%86%8C%EC%88%98%EC%A0%90" title="부동소수점 – Korean" lang="ko" hreflang="ko" data-title="부동소수점" data-language-autonym="한국어" data-language-local-name="Korean" class="interlanguage-link-target"><span>한국어</span></a></li><li class="interlanguage-link interwiki-io mw-list-item"><a href="https://io.wikipedia.org/wiki/Flotac-komo" title="Flotac-komo – Ido" lang="io" 
hreflang="io" data-title="Flotac-komo" data-language-autonym="Ido" data-language-local-name="Ido" class="interlanguage-link-target"><span>Ido</span></a></li><li class="interlanguage-link interwiki-id mw-list-item"><a href="https://id.wikipedia.org/wiki/Aritmetika_titik_kambang" title="Aritmetika titik kambang – Indonesian" lang="id" hreflang="id" data-title="Aritmetika titik kambang" data-language-autonym="Bahasa Indonesia" data-language-local-name="Indonesian" class="interlanguage-link-target"><span>Bahasa Indonesia</span></a></li><li class="interlanguage-link interwiki-it mw-list-item"><a href="https://it.wikipedia.org/wiki/Numero_in_virgola_mobile" title="Numero in virgola mobile – Italian" lang="it" hreflang="it" data-title="Numero in virgola mobile" data-language-autonym="Italiano" data-language-local-name="Italian" class="interlanguage-link-target"><span>Italiano</span></a></li><li class="interlanguage-link interwiki-he mw-list-item"><a href="https://he.wikipedia.org/wiki/%D7%A0%D7%A7%D7%95%D7%93%D7%94_%D7%A6%D7%A4%D7%94" title="נקודה צפה – Hebrew" lang="he" hreflang="he" data-title="נקודה צפה" data-language-autonym="עברית" data-language-local-name="Hebrew" class="interlanguage-link-target"><span>עברית</span></a></li><li class="interlanguage-link interwiki-lv mw-list-item"><a href="https://lv.wikipedia.org/wiki/Peldo%C5%A1ais_komats" title="Peldošais komats – Latvian" lang="lv" hreflang="lv" data-title="Peldošais komats" data-language-autonym="Latviešu" data-language-local-name="Latvian" class="interlanguage-link-target"><span>Latviešu</span></a></li><li class="interlanguage-link interwiki-lmo mw-list-item"><a href="https://lmo.wikipedia.org/wiki/Virgola_mobil" title="Virgola mobil – Lombard" lang="lmo" hreflang="lmo" data-title="Virgola mobil" data-language-autonym="Lombard" data-language-local-name="Lombard" class="interlanguage-link-target"><span>Lombard</span></a></li><li class="interlanguage-link interwiki-hu mw-list-item"><a 
href="https://hu.wikipedia.org/wiki/Lebeg%C5%91pontos_sz%C3%A1m%C3%A1br%C3%A1zol%C3%A1s" title="Lebegőpontos számábrázolás – Hungarian" lang="hu" hreflang="hu" data-title="Lebegőpontos számábrázolás" data-language-autonym="Magyar" data-language-local-name="Hungarian" class="interlanguage-link-target"><span>Magyar</span></a></li><li class="interlanguage-link interwiki-mg mw-list-item"><a href="https://mg.wikipedia.org/wiki/Faingo_mihevaheva" title="Faingo mihevaheva – Malagasy" lang="mg" hreflang="mg" data-title="Faingo mihevaheva" data-language-autonym="Malagasy" data-language-local-name="Malagasy" class="interlanguage-link-target"><span>Malagasy</span></a></li><li class="interlanguage-link interwiki-ml mw-list-item"><a href="https://ml.wikipedia.org/wiki/%E0%B4%AB%E0%B5%8D%E0%B4%B2%E0%B5%8B%E0%B4%9F%E0%B5%8D%E0%B4%9F%E0%B4%BF%E0%B4%99%E0%B5%8D%E0%B4%99%E0%B5%8D_%E0%B4%AA%E0%B5%8B%E0%B4%AF%E0%B4%BF%E0%B4%A8%E0%B5%8D%E0%B4%B1%E0%B5%8D" title="ഫ്ലോട്ടിങ്ങ് പോയിന്റ് – Malayalam" lang="ml" hreflang="ml" data-title="ഫ്ലോട്ടിങ്ങ് പോയിന്റ്" data-language-autonym="മലയാളം" data-language-local-name="Malayalam" class="interlanguage-link-target"><span>മലയാളം</span></a></li><li class="interlanguage-link interwiki-mn mw-list-item"><a href="https://mn.wikipedia.org/wiki/%D0%A5%D3%A9%D1%80%D0%B2%D3%A9%D1%85_%D1%86%D1%8D%D0%B3%D0%B8%D0%B9%D0%BD_%D0%B0%D1%80%D0%B8%D1%84%D0%BC%D0%B5%D1%82%D0%B8%D0%BA" title="Хөрвөх цэгийн арифметик – Mongolian" lang="mn" hreflang="mn" data-title="Хөрвөх цэгийн арифметик" data-language-autonym="Монгол" data-language-local-name="Mongolian" class="interlanguage-link-target"><span>Монгол</span></a></li><li class="interlanguage-link interwiki-nl mw-list-item"><a href="https://nl.wikipedia.org/wiki/Zwevendekommagetal" title="Zwevendekommagetal – Dutch" lang="nl" hreflang="nl" data-title="Zwevendekommagetal" data-language-autonym="Nederlands" data-language-local-name="Dutch" class="interlanguage-link-target"><span>Nederlands</span></a></li><li 
class="interlanguage-link interwiki-ja mw-list-item"><a href="https://ja.wikipedia.org/wiki/%E6%B5%AE%E5%8B%95%E5%B0%8F%E6%95%B0%E7%82%B9%E6%95%B0" title="浮動小数点数 – Japanese" lang="ja" hreflang="ja" data-title="浮動小数点数" data-language-autonym="日本語" data-language-local-name="Japanese" class="interlanguage-link-target"><span>日本語</span></a></li><li class="interlanguage-link interwiki-no mw-list-item"><a href="https://no.wikipedia.org/wiki/Flyttall" title="Flyttall – Norwegian Bokmål" lang="nb" hreflang="nb" data-title="Flyttall" data-language-autonym="Norsk bokmål" data-language-local-name="Norwegian Bokmål" class="interlanguage-link-target"><span>Norsk bokmål</span></a></li><li class="interlanguage-link interwiki-pl mw-list-item"><a href="https://pl.wikipedia.org/wiki/Liczba_zmiennoprzecinkowa" title="Liczba zmiennoprzecinkowa – Polish" lang="pl" hreflang="pl" data-title="Liczba zmiennoprzecinkowa" data-language-autonym="Polski" data-language-local-name="Polish" class="interlanguage-link-target"><span>Polski</span></a></li><li class="interlanguage-link interwiki-pt mw-list-item"><a href="https://pt.wikipedia.org/wiki/V%C3%ADrgula_flutuante" title="Vírgula flutuante – Portuguese" lang="pt" hreflang="pt" data-title="Vírgula flutuante" data-language-autonym="Português" data-language-local-name="Portuguese" class="interlanguage-link-target"><span>Português</span></a></li><li class="interlanguage-link interwiki-ro mw-list-item"><a href="https://ro.wikipedia.org/wiki/Virgul%C4%83_mobil%C4%83" title="Virgulă mobilă – Romanian" lang="ro" hreflang="ro" data-title="Virgulă mobilă" data-language-autonym="Română" data-language-local-name="Romanian" class="interlanguage-link-target"><span>Română</span></a></li><li class="interlanguage-link interwiki-ru mw-list-item"><a href="https://ru.wikipedia.org/wiki/%D0%A7%D0%B8%D1%81%D0%BB%D0%BE_%D1%81_%D0%BF%D0%BB%D0%B0%D0%B2%D0%B0%D1%8E%D1%89%D0%B5%D0%B9_%D0%B7%D0%B0%D0%BF%D1%8F%D1%82%D0%BE%D0%B9" title="Число с плавающей запятой – Russian" 
lang="ru" hreflang="ru" data-title="Число с плавающей запятой" data-language-autonym="Русский" data-language-local-name="Russian" class="interlanguage-link-target"><span>Русский</span></a></li><li class="interlanguage-link interwiki-sq mw-list-item"><a href="https://sq.wikipedia.org/wiki/Float" title="Float – Albanian" lang="sq" hreflang="sq" data-title="Float" data-language-autonym="Shqip" data-language-local-name="Albanian" class="interlanguage-link-target"><span>Shqip</span></a></li><li class="interlanguage-link interwiki-simple mw-list-item"><a href="https://simple.wikipedia.org/wiki/Floating_point" title="Floating point – Simple English" lang="en-simple" hreflang="en-simple" data-title="Floating point" data-language-autonym="Simple English" data-language-local-name="Simple English" class="interlanguage-link-target"><span>Simple English</span></a></li><li class="interlanguage-link interwiki-sk mw-list-item"><a href="https://sk.wikipedia.org/wiki/Pohybliv%C3%A1_r%C3%A1dov%C3%A1_%C4%8Diarka" title="Pohyblivá rádová čiarka – Slovak" lang="sk" hreflang="sk" data-title="Pohyblivá rádová čiarka" data-language-autonym="Slovenčina" data-language-local-name="Slovak" class="interlanguage-link-target"><span>Slovenčina</span></a></li><li class="interlanguage-link interwiki-sl mw-list-item"><a href="https://sl.wikipedia.org/wiki/Plavajo%C4%8Da_vejica" title="Plavajoča vejica – Slovenian" lang="sl" hreflang="sl" data-title="Plavajoča vejica" data-language-autonym="Slovenščina" data-language-local-name="Slovenian" class="interlanguage-link-target"><span>Slovenščina</span></a></li><li class="interlanguage-link interwiki-sr mw-list-item"><a href="https://sr.wikipedia.org/wiki/%D0%90%D1%80%D0%B8%D1%82%D0%BC%D0%B5%D1%82%D0%B8%D0%BA%D0%B0_%D1%81%D0%B0_%D0%BF%D0%BE%D0%BA%D1%80%D0%B5%D1%82%D0%BD%D0%B8%D0%BC_%D0%B7%D0%B0%D1%80%D0%B5%D0%B7%D0%BE%D0%BC" title="Аритметика са покретним зарезом – Serbian" lang="sr" hreflang="sr" data-title="Аритметика са покретним зарезом" 
data-language-autonym="Српски / srpski" data-language-local-name="Serbian" class="interlanguage-link-target"><span>Српски / srpski</span></a></li><li class="interlanguage-link interwiki-fi mw-list-item"><a href="https://fi.wikipedia.org/wiki/Liukuluku" title="Liukuluku – Finnish" lang="fi" hreflang="fi" data-title="Liukuluku" data-language-autonym="Suomi" data-language-local-name="Finnish" class="interlanguage-link-target"><span>Suomi</span></a></li><li class="interlanguage-link interwiki-sv mw-list-item"><a href="https://sv.wikipedia.org/wiki/Flyttal" title="Flyttal – Swedish" lang="sv" hreflang="sv" data-title="Flyttal" data-language-autonym="Svenska" data-language-local-name="Swedish" class="interlanguage-link-target"><span>Svenska</span></a></li><li class="interlanguage-link interwiki-th mw-list-item"><a href="https://th.wikipedia.org/wiki/%E0%B8%88%E0%B8%B3%E0%B8%99%E0%B8%A7%E0%B8%99%E0%B8%88%E0%B8%B8%E0%B8%94%E0%B8%A5%E0%B8%AD%E0%B8%A2%E0%B8%95%E0%B8%B1%E0%B8%A7" title="จำนวนจุดลอยตัว – Thai" lang="th" hreflang="th" data-title="จำนวนจุดลอยตัว" data-language-autonym="ไทย" data-language-local-name="Thai" class="interlanguage-link-target"><span>ไทย</span></a></li><li class="interlanguage-link interwiki-tr mw-list-item"><a href="https://tr.wikipedia.org/wiki/Kayan_nokta" title="Kayan nokta – Turkish" lang="tr" hreflang="tr" data-title="Kayan nokta" data-language-autonym="Türkçe" data-language-local-name="Turkish" class="interlanguage-link-target"><span>Türkçe</span></a></li><li class="interlanguage-link interwiki-uk mw-list-item"><a href="https://uk.wikipedia.org/wiki/%D0%A7%D0%B8%D1%81%D0%BB%D0%BE_%D0%B7_%D1%80%D1%83%D1%85%D0%BE%D0%BC%D0%BE%D1%8E_%D0%BA%D0%BE%D0%BC%D0%BE%D1%8E" title="Число з рухомою комою – Ukrainian" lang="uk" hreflang="uk" data-title="Число з рухомою комою" data-language-autonym="Українська" data-language-local-name="Ukrainian" class="interlanguage-link-target"><span>Українська</span></a></li><li class="interlanguage-link interwiki-vi 
mw-list-item"><a href="https://vi.wikipedia.org/wiki/S%E1%BB%91_th%E1%BB%B1c_d%E1%BA%A5u_ph%E1%BA%A9y_%C4%91%E1%BB%99ng" title="Số thực dấu phẩy động – Vietnamese" lang="vi" hreflang="vi" data-title="Số thực dấu phẩy động" data-language-autonym="Tiếng Việt" data-language-local-name="Vietnamese" class="interlanguage-link-target"><span>Tiếng Việt</span></a></li><li class="interlanguage-link interwiki-zh-yue mw-list-item"><a href="https://zh-yue.wikipedia.org/wiki/%E6%B5%AE%E9%BB%9E%E6%95%B8" title="浮點數 – Cantonese" lang="yue" hreflang="yue" data-title="浮點數" data-language-autonym="粵語" data-language-local-name="Cantonese" class="interlanguage-link-target"><span>粵語</span></a></li><li class="interlanguage-link interwiki-zh mw-list-item"><a href="https://zh.wikipedia.org/wiki/%E6%B5%AE%E7%82%B9%E6%95%B0%E8%BF%90%E7%AE%97" title="浮点数运算 – Chinese" lang="zh" hreflang="zh" data-title="浮点数运算" data-language-autonym="中文" data-language-local-name="Chinese" class="interlanguage-link-target"><span>中文</span></a></li> </ul> <div class="after-portlet after-portlet-lang"><span class="wb-langlinks-edit wb-langlinks-link"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q117879#sitelinks-wikipedia" title="Edit interlanguage links" class="wbc-editpage">Edit links</a></span></div> </div> </div> </div> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Floating-point_arithmetic" title="View the content page [c]" accesskey="c"><span>Article</span></a></li><li id="ca-talk" class="vector-tab-noicon mw-list-item"><a href="/wiki/Talk:Floating-point_arithmetic" rel="discussion" title="Discuss improvements to the content page [t]" 
accesskey="t"><span>Talk</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Floating-point_arithmetic"><span>Read</span></a></li><li id="ca-edit" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Floating-point_arithmetic&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Floating-point_arithmetic&action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" 
data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Floating-point_arithmetic"><span>Read</span></a></li><li id="ca-more-edit" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Floating-point_arithmetic&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a 
href="/w/index.php?title=Floating-point_arithmetic&action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Floating-point_arithmetic" title="List of all English Wikipedia pages containing links to this page [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Floating-point_arithmetic" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-upload" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]" accesskey="u"><span>Upload file</span></a></li><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Floating-point_arithmetic&oldid=1280068135" title="Permanent link to this revision of this page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Floating-point_arithmetic&action=info" title="More information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&page=Floating-point_arithmetic&id=1280068135&wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FFloating-point_arithmetic"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FFloating-point_arithmetic"><span>Download QR code</span></a></li> 
</ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> Print/export </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&page=Floating-point_arithmetic&action=show-download-screen" title="Download this page as a PDF file"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Floating-point_arithmetic&printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> <div id="p-wikibase-otherprojects" class="vector-menu mw-portlet mw-portlet-wikibase-otherprojects" > <div class="vector-menu-heading"> In other projects </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="wb-otherproject-link wb-otherproject-commons mw-list-item"><a href="https://commons.wikimedia.org/wiki/Category:Floating_point" hreflang="en"><span>Wikimedia Commons</span></a></li><li id="t-wikibase" class="wb-otherproject-link wb-otherproject-wikibase-dataitem mw-list-item"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q117879" title="Structured data on this page hosted by Wikidata [g]" accesskey="g"><span>Wikidata item</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header 
vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"><span class="mw-redirectedfrom">(Redirected from <a href="/w/index.php?title=Floating_point&redirect=no" class="mw-redirect" title="Floating point">Floating point</a>)</span></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><style data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div role="note" class="hatnote navigation-not-searchable">"Floating point" redirects here. 
For other uses, see <a href="/wiki/Floating_point_(disambiguation)" class="mw-disambig" title="Floating point (disambiguation)">Floating point (disambiguation)</a>.</div> <div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Computer approximation for real numbers</div> <p class="mw-empty-elt"> </p> <figure typeof="mw:File/Thumb"><a href="/wiki/File:Z3_Deutsches_Museum.JPG" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Z3_Deutsches_Museum.JPG/200px-Z3_Deutsches_Museum.JPG" decoding="async" width="200" height="150" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Z3_Deutsches_Museum.JPG/300px-Z3_Deutsches_Museum.JPG 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Z3_Deutsches_Museum.JPG/400px-Z3_Deutsches_Museum.JPG 2x" data-file-width="1600" data-file-height="1200" /></a><figcaption>An early electromechanical programmable computer, the <a href="/wiki/Z3_(computer)" title="Z3 (computer)">Z3</a>, included floating-point arithmetic (replica on display at <a href="/wiki/Deutsches_Museum" title="Deutsches Museum">Deutsches Museum</a> in <a href="/wiki/Munich" title="Munich">Munich</a>).</figcaption></figure> <style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul ul{display:inline}.mw-parser-output .hlist 
.mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><style data-mw-deduplicate="TemplateStyles:r1126788409">.mw-parser-output .plainlist ol,.mw-parser-output .plainlist ul{line-height:inherit;list-style:none;margin:0;padding:0}.mw-parser-output .plainlist ol li,.mw-parser-output .plainlist ul li{margin-bottom:0}</style><style 
data-mw-deduplicate="TemplateStyles:r1246091330">.mw-parser-output .sidebar{width:22em;float:right;clear:right;margin:0.5em 0 1em 1em;background:var(--background-color-neutral-subtle,#f8f9fa);border:1px solid var(--border-color-base,#a2a9b1);padding:0.2em;text-align:center;line-height:1.4em;font-size:88%;border-collapse:collapse;display:table}body.skin-minerva .mw-parser-output .sidebar{display:table!important;float:right!important;margin:0.5em 0 1em 1em!important}.mw-parser-output .sidebar-subgroup{width:100%;margin:0;border-spacing:0}.mw-parser-output .sidebar-left{float:left;clear:left;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-none{float:none;clear:both;margin:0.5em 1em 1em 0}.mw-parser-output .sidebar-outer-title{padding:0 0.4em 0.2em;font-size:125%;line-height:1.2em;font-weight:bold}.mw-parser-output .sidebar-top-image{padding:0.4em}.mw-parser-output .sidebar-top-caption,.mw-parser-output .sidebar-pretitle-with-top-image,.mw-parser-output .sidebar-caption{padding:0.2em 0.4em 0;line-height:1.2em}.mw-parser-output .sidebar-pretitle{padding:0.4em 0.4em 0;line-height:1.2em}.mw-parser-output .sidebar-title,.mw-parser-output .sidebar-title-with-pretitle{padding:0.2em 0.8em;font-size:145%;line-height:1.2em}.mw-parser-output .sidebar-title-with-pretitle{padding:0.1em 0.4em}.mw-parser-output .sidebar-image{padding:0.2em 0.4em 0.4em}.mw-parser-output .sidebar-heading{padding:0.1em 0.4em}.mw-parser-output .sidebar-content{padding:0 0.5em 0.4em}.mw-parser-output .sidebar-content-with-subgroup{padding:0.1em 0.4em 0.2em}.mw-parser-output .sidebar-above,.mw-parser-output .sidebar-below{padding:0.3em 0.8em;font-weight:bold}.mw-parser-output .sidebar-collapse .sidebar-above,.mw-parser-output .sidebar-collapse .sidebar-below{border-top:1px solid #aaa;border-bottom:1px solid #aaa}.mw-parser-output .sidebar-navbar{text-align:right;font-size:115%;padding:0 0.4em 0.4em}.mw-parser-output .sidebar-list-title{padding:0 
0.4em;text-align:left;font-weight:bold;line-height:1.6em;font-size:105%}.mw-parser-output .sidebar-list-title-c{padding:0 0.4em;text-align:center;margin:0 3.3em}@media(max-width:640px){body.mediawiki .mw-parser-output .sidebar{width:100%!important;clear:both;float:none!important;margin-left:0!important;margin-right:0!important}}body.skin--responsive .mw-parser-output .sidebar a>img{max-width:none!important}@media screen{html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-list-title,html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle{background:transparent!important}html.skin-theme-clientpref-night .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle a{color:var(--color-progressive)!important}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-list-title,html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle{background:transparent!important}html.skin-theme-clientpref-os .mw-parser-output .sidebar:not(.notheme) .sidebar-title-with-pretitle a{color:var(--color-progressive)!important}}@media print{body.ns-0 .mw-parser-output .sidebar{display:none!important}}</style><table class="sidebar nomobile nowraplinks plainlist"><tbody><tr><th class="sidebar-title"><a class="mw-selflink selflink">Floating-point</a> <a href="/wiki/Computer_number_format" title="Computer number format">formats</a></th></tr><tr><th class="sidebar-heading"> <a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a></th></tr><tr><td class="sidebar-content"> <ul><li>16-bit: <a href="/wiki/Half-precision_floating-point_format" title="Half-precision floating-point format">Half</a> (binary16)</li> <li>32-bit: <a href="/wiki/Single-precision_floating-point_format" title="Single-precision floating-point format">Single</a> (binary32), <a href="/wiki/Decimal32_floating-point_format" 
title="Decimal32 floating-point format">decimal32</a></li> <li>64-bit: <a href="/wiki/Double-precision_floating-point_format" title="Double-precision floating-point format">Double</a> (binary64), <a href="/wiki/Decimal64_floating-point_format" title="Decimal64 floating-point format">decimal64</a></li> <li>128-bit: <a href="/wiki/Quadruple-precision_floating-point_format" title="Quadruple-precision floating-point format">Quadruple</a> (binary128), <a href="/wiki/Decimal128_floating-point_format" title="Decimal128 floating-point format">decimal128</a></li> <li>256-bit: <a href="/wiki/Octuple-precision_floating-point_format" title="Octuple-precision floating-point format">Octuple</a> (binary256)</li> <li><a href="/wiki/Extended_precision" title="Extended precision">Extended precision</a></li></ul></td> </tr><tr><th class="sidebar-heading"> Other</th></tr><tr><td class="sidebar-content"> <ul><li><a href="/wiki/Minifloat" title="Minifloat">Minifloat</a></li> <li><a href="/wiki/Bfloat16_floating-point_format" title="Bfloat16 floating-point format">bfloat16</a></li> <li><a href="/wiki/TensorFloat-32" title="TensorFloat-32">TensorFloat-32</a></li> <li><a href="/wiki/Microsoft_Binary_Format" title="Microsoft Binary Format">Microsoft Binary Format</a></li> <li><a href="/wiki/IBM_hexadecimal_floating-point" title="IBM hexadecimal floating-point">IBM floating-point architecture</a></li> <li><a href="/wiki/Power_Management_Bus#Linear11_Floating-Point_Format" title="Power Management Bus">PMBus Linear-11</a></li> <li><a href="/wiki/G.711" title="G.711">G.711 8-bit floats</a></li></ul></td> </tr><tr><th class="sidebar-heading"> Alternatives</th></tr><tr><td class="sidebar-content"> <ul><li><a href="/wiki/Arbitrary-precision_arithmetic" title="Arbitrary-precision arithmetic">Arbitrary precision</a></li></ul></td> </tr><tr><th class="sidebar-heading"> <a href="/wiki/Tapered_floating_point" title="Tapered floating point">Tapered floating point</a></th></tr><tr><td 
class="sidebar-content"> <ul><li><a href="/wiki/Unum_(number_format)" title="Unum (number format)">Posit</a></li></ul></td> </tr><tr><td class="sidebar-navbar"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" /><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output .navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}</style><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Floating-point" title="Template:Floating-point"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Floating-point" title="Template talk:Floating-point"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Floating-point" title="Special:EditPage/Template:Floating-point"><abbr title="Edit this template">e</abbr></a></li></ul></div></td></tr></tbody></table> <p>In <a 
href="/wiki/Computing" title="Computing">computing</a>, <b>floating-point arithmetic</b> (<b>FP</b>) is <a href="/wiki/Arithmetic" title="Arithmetic">arithmetic</a> on subsets of <a href="/wiki/Real_number" title="Real number">real numbers</a> formed by a <i><a href="/wiki/Significand" title="Significand">significand</a></i> (a <a href="/wiki/Sign_(mathematics)" title="Sign (mathematics)">signed</a> sequence of a fixed number of digits in some <a href="/wiki/Radix" title="Radix">base</a>) multiplied by an <a href="/wiki/Integer_power" class="mw-redirect" title="Integer power">integer power</a> of that base. Numbers of this form are called <b>floating-point numbers</b>.<sup id="cite_ref-Muller_2010_1-0" class="reference"><a href="#cite_note-Muller_2010-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page / location: 3">: 3 </span></sup><sup id="cite_ref-sterbenz1974fpcomp_2-0" class="reference"><a href="#cite_note-sterbenz1974fpcomp-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page / location: 10">: 10 </span></sup> </p><p>For example, the number 2469/200 is a floating-point number in base ten with five digits: <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 2469/200=12.345=\!\underbrace {12345} _{\text{significand}}\!\times \!\underbrace {10} _{\text{base}}\!\!\!\!\!\!\!\overbrace {{}^{-3}} ^{\text{exponent}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>2469</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mn>200</mn> <mo>=</mo> <mn>12.345</mn> <mo>=</mo> <mspace width="negativethinmathspace"></mspace> <munder> <mrow class="MJX-TeXAtom-OP MJX-fixedlimits"> <munder> <mn>12345</mn> <mo>⏟<!-- ⏟ --></mo> 
</munder> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>significand</mtext> </mrow> </munder> <mspace width="negativethinmathspace"></mspace> <mo>×<!-- × --></mo> <mspace width="negativethinmathspace"></mspace> <munder> <mrow class="MJX-TeXAtom-OP MJX-fixedlimits"> <munder> <mn>10</mn> <mo>⏟<!-- ⏟ --></mo> </munder> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>base</mtext> </mrow> </munder> <mspace width="negativethinmathspace"></mspace> <mspace width="negativethinmathspace"></mspace> <mspace width="negativethinmathspace"></mspace> <mspace width="negativethinmathspace"></mspace> <mspace width="negativethinmathspace"></mspace> <mspace width="negativethinmathspace"></mspace> <mspace width="negativethinmathspace"></mspace> <mover> <mrow class="MJX-TeXAtom-OP MJX-fixedlimits"> <mover> <msup> <mrow class="MJX-TeXAtom-ORD"> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mn>3</mn> </mrow> </msup> <mo>⏞<!-- ⏞ --></mo> </mover> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>exponent</mtext> </mrow> </mover> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 2469/200=12.345=\!\underbrace {12345} _{\text{significand}}\!\times \!\underbrace {10} _{\text{base}}\!\!\!\!\!\!\!\overbrace {{}^{-3}} ^{\text{exponent}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/0ccca5d6eada3559b1f5ab2f4e3bd2c7ace38003" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -4.171ex; width:38.947ex; height:9.843ex;" alt="{\displaystyle 2469/200=12.345=\!\underbrace {12345} _{\text{significand}}\!\times \!\underbrace {10} _{\text{base}}\!\!\!\!\!\!\!\overbrace {{}^{-3}} ^{\text{exponent}}}" /></span> However, 7716/625 = 12.3456 is not a floating-point number in base ten with five digits—it needs six digits. The nearest floating-point number with only five digits is 12.346. 
And 1/3 = 0.3333… is not a floating-point number in base ten with any finite number of digits. In practice, most floating-point systems use <a href="/wiki/Binary_number" title="Binary number">base two</a>, though base ten (<a href="/wiki/Decimal_floating_point" title="Decimal floating point">decimal floating point</a>) is also common. </p><p>Floating-point arithmetic operations, such as addition and division, approximate the corresponding real number arithmetic operations by <a href="/wiki/Rounding" title="Rounding">rounding</a> any result that is not a floating-point number itself to a nearby floating-point number.<sup id="cite_ref-Muller_2010_1-1" class="reference"><a href="#cite_note-Muller_2010-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page / location: 22">: 22 </span></sup><sup id="cite_ref-sterbenz1974fpcomp_2-1" class="reference"><a href="#cite_note-sterbenz1974fpcomp-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page / location: 10">: 10 </span></sup> For example, in a floating-point arithmetic with five base-ten digits, the sum 12.345 + 1.0001 = 13.3451 might be rounded to 13.345. </p><p>The term <i>floating point</i> refers to the fact that the number's <a href="/wiki/Radix_point" class="mw-redirect" title="Radix point">radix point</a> can "float" anywhere to the left, right, or between the <a href="/wiki/Significant_digits" class="mw-redirect" title="Significant digits">significant digits</a> of the number. This position is indicated by the exponent, so floating point can be considered a form of <a href="/wiki/Scientific_notation" title="Scientific notation">scientific notation</a>. 
</p><p>A floating-point system can be used to represent, with a fixed number of digits, numbers of very different <a href="/wiki/Orders_of_magnitude_(numbers)" title="Orders of magnitude (numbers)">orders of magnitude</a> — such as the number of meters <a href="/wiki/Orders_of_magnitude_(length)#100_zettametres" title="Orders of magnitude (length)">between galaxies</a> or <a href="/wiki/Orders_of_magnitude_(length)#10_femtometres" title="Orders of magnitude (length)">between protons in an atom</a>. For this reason, floating-point arithmetic is often used in applications that must handle both very small and very large real numbers with fast processing times. The result of this <a href="/wiki/Dynamic_range" title="Dynamic range">dynamic range</a> is that the numbers that can be represented are not uniformly spaced; the difference between two consecutive representable numbers varies with their exponent.<sup id="cite_ref-Smith_1997_3-0" class="reference"><a href="#cite_note-Smith_1997-3"><span class="cite-bracket">[</span>3<span class="cite-bracket">]</span></a></sup> </p> <figure class="mw-halign-right" typeof="mw:File/Thumb"><a href="/wiki/File:A_number_line_representing_single-precision_floating_point%27s_numbers_and_numbers_that_it_cannot_display.png" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/9/98/A_number_line_representing_single-precision_floating_point%27s_numbers_and_numbers_that_it_cannot_display.png/500px-A_number_line_representing_single-precision_floating_point%27s_numbers_and_numbers_that_it_cannot_display.png" decoding="async" width="500" height="93" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/98/A_number_line_representing_single-precision_floating_point%27s_numbers_and_numbers_that_it_cannot_display.png/960px-A_number_line_representing_single-precision_floating_point%27s_numbers_and_numbers_that_it_cannot_display.png 1.5x, 
//upload.wikimedia.org/wikipedia/commons/thumb/9/98/A_number_line_representing_single-precision_floating_point%27s_numbers_and_numbers_that_it_cannot_display.png/1000px-A_number_line_representing_single-precision_floating_point%27s_numbers_and_numbers_that_it_cannot_display.png 2x" data-file-width="1814" data-file-height="337" /></a><figcaption>Single-precision floating-point numbers on a <a href="/wiki/Number_line" title="Number line">number line</a>: the green lines mark representable values.</figcaption></figure> <figure class="mw-halign-right" typeof="mw:File/Thumb"><a href="/wiki/File:FloatingPointPrecisionAugmented.png" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b6/FloatingPointPrecisionAugmented.png/500px-FloatingPointPrecisionAugmented.png" decoding="async" width="500" height="18" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b6/FloatingPointPrecisionAugmented.png/750px-FloatingPointPrecisionAugmented.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/b/b6/FloatingPointPrecisionAugmented.png/1000px-FloatingPointPrecisionAugmented.png 2x" data-file-width="2116" data-file-height="78" /></a><figcaption>Augmented version above showing both <a href="/wiki/Signed_number_representations" title="Signed number representations">signs</a> of representable values</figcaption></figure> <p>Over the years, a variety of floating-point representations have been used in computers. In 1985, the <a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a> Standard for Floating-Point Arithmetic was established, and since the 1990s, the most commonly encountered representations are those defined by the IEEE. 
</p><p>The speed of floating-point operations, commonly measured in terms of <a href="/wiki/FLOPS" class="mw-redirect" title="FLOPS">FLOPS</a>, is an important characteristic of a <a href="/wiki/Computer_system" class="mw-redirect" title="Computer system">computer system</a>, especially for applications that involve intensive mathematical calculations. </p><p>A <a href="/wiki/Floating-point_unit" title="Floating-point unit">floating-point unit</a> (FPU, colloquially a math <a href="/wiki/Coprocessor" title="Coprocessor">coprocessor</a>) is a part of a computer system specially designed to carry out operations on floating-point numbers. </p> <meta property="mw:PageProp/toc" /> <div class="mw-heading mw-heading2"><h2 id="Overview">Overview</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=1" title="Edit section: Overview"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <div class="mw-heading mw-heading3"><h3 id="Floating-point_numbers">Floating-point numbers</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=2" title="Edit section: Floating-point numbers"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A <a href="/wiki/Number_representation" class="mw-redirect" title="Number representation">number representation</a> specifies some way of encoding a number, usually as a string of digits. </p><p>There are several mechanisms by which strings of digits can represent numbers. In standard mathematical notation, the digit string can be of any length, and the location of the <a href="/wiki/Radix_point" class="mw-redirect" title="Radix point">radix point</a> is indicated by placing an explicit <a href="/wiki/Decimal_separator" title="Decimal separator">"point" character</a> (dot or comma) there. 
If the radix point is not specified, then the string implicitly represents an <a href="/wiki/Integer" title="Integer">integer</a> and the unstated radix point would be off the right-hand end of the string, next to the least significant digit. In <a href="/wiki/Fixed-point_arithmetic" title="Fixed-point arithmetic">fixed-point</a> systems, a position in the string is specified for the radix point. So a fixed-point scheme might use a string of 8 decimal digits with the decimal point in the middle, whereby "00012345" would represent 0001.2345. </p><p>In <a href="/wiki/Scientific_notation" title="Scientific notation">scientific notation</a>, the given number is scaled by a <a href="/wiki/Power_of_10" title="Power of 10">power of 10</a>, so that it lies within a specific range—typically between 1 and 10, with the radix point appearing immediately after the first digit. As a power of ten, the scaling factor is then indicated separately at the end of the number. For example, the orbital period of <a href="/wiki/Jupiter" title="Jupiter">Jupiter</a>'s moon <a href="/wiki/Io_(moon)" title="Io (moon)">Io</a> is <span class="nowrap"><span data-sort-value="7005152853504700000♠"></span>152,853.5047</span> seconds, a value that would be represented in standard-form scientific notation as <span class="nowrap"><span data-sort-value="7005152853504700000♠"></span>1.528535047<span style="margin-left:0.25em;margin-right:0.15em;">×</span>10<sup>5</sup></span> seconds. </p><p>Floating-point representation is similar in concept to scientific notation. Logically, a floating-point number consists of: </p> <ul><li>A signed (meaning positive or negative) digit string of a given length in a given <a href="/wiki/Radix" title="Radix">radix</a> (or base). 
This digit string is referred to as the <i><a href="/wiki/Significand" title="Significand">significand</a></i>, <i>mantissa</i>, or <i>coefficient</i>.<sup id="cite_ref-NB_Significand_4-0" class="reference"><a href="#cite_note-NB_Significand-4"><span class="cite-bracket">[</span>nb 1<span class="cite-bracket">]</span></a></sup> The length of the significand determines the <i>precision</i> to which numbers can be represented. The radix point position is assumed always to be somewhere within the significand—often just after or just before the most significant digit, or to the right of the rightmost (least significant) digit. This article generally follows the convention that the radix point is set just after the most significant (leftmost) digit.</li> <li>A signed integer <a href="/wiki/Exponent" class="mw-redirect" title="Exponent">exponent</a> (also referred to as the <i>characteristic</i>, or <i>scale</i>),<sup id="cite_ref-NB_Exponent_5-0" class="reference"><a href="#cite_note-NB_Exponent-5"><span class="cite-bracket">[</span>nb 2<span class="cite-bracket">]</span></a></sup> which modifies the magnitude of the number.</li></ul> <p>To derive the value of the floating-point number, the <i>significand</i> is multiplied by the <i>base</i> raised to the power of the <i>exponent</i>, equivalent to shifting the radix point from its implied position by a number of places equal to the value of the exponent—to the right if the exponent is positive or to the left if the exponent is negative. </p><p>Using base-10 (the familiar <a href="/wiki/Decimal_representation" title="Decimal representation">decimal</a> notation) as an example, the number <span class="nowrap"><span data-sort-value="7005152853504700000♠"></span>152,853.5047</span>, which has ten decimal digits of precision, is represented as the significand <span class="nowrap"><span data-sort-value="7009152853504700000♠"></span>1,528,535,047</span> together with 5 as the exponent. 
To determine the actual value, a decimal point is placed after the first digit of the significand and the result is multiplied by 10<sup><span class="nowrap"><span data-sort-value="7000500000000000000♠"></span>5</span></sup> to give <span class="nowrap"><span data-sort-value="7005152853504700000♠"></span>1.528535047<span style="margin-left:0.25em;margin-right:0.15em;">×</span>10<sup>5</sup></span>, or <span class="nowrap"><span data-sort-value="7005152853504700000♠"></span>152,853.5047</span>. In storing such a number, the base (10) need not be stored, since it will be the same for the entire range of supported numbers, and can thus be inferred. </p><p>Symbolically, this final value is: <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\frac {s}{b^{\,p-1}}}\times b^{e},}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mi>s</mi> <msup> <mi>b</mi> <mrow class="MJX-TeXAtom-ORD"> <mspace width="thinmathspace"></mspace> <mi>p</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msup> </mfrac> </mrow> <mo>×<!-- × --></mo> <msup> <mi>b</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>e</mi> </mrow> </msup> <mo>,</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\frac {s}{b^{\,p-1}}}\times b^{e},}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/86a81cce555673e1074b92fb4867de5dc050d9e6" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.171ex; width:10.864ex; height:5.009ex;" alt="{\displaystyle {\frac {s}{b^{\,p-1}}}\times b^{e},}" /></span> </p><p>where <span class="texhtml mvar" style="font-style:italic;">s</span> is the significand (ignoring any implied decimal point), <span class="texhtml mvar" 
style="font-style:italic;">p</span> is the precision (the number of digits in the significand), <span class="texhtml mvar" style="font-style:italic;">b</span> is the base (in our example, this is the number <i>ten</i>), and <span class="texhtml mvar" style="font-style:italic;">e</span> is the exponent. </p><p><span class="anchor" id="Base-4"></span><span class="anchor" id="Base-8"></span><span class="anchor" id="Base-256"></span><span class="anchor" id="Base-65536"></span>Historically, several number bases have been used for representing floating-point numbers, with base two (<a href="/wiki/Binary_numeral_system" class="mw-redirect" title="Binary numeral system">binary</a>) being the most common, followed by base ten (<a href="/wiki/Decimal_floating_point" title="Decimal floating point">decimal floating point</a>), and other less common varieties, such as base sixteen (<a href="/wiki/Hexadecimal_floating_point" title="Hexadecimal floating point">hexadecimal floating point</a><sup id="cite_ref-Zehendner_2008_6-0" class="reference"><a href="#cite_note-Zehendner_2008-6"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Beebe_2017_7-0" class="reference"><a href="#cite_note-Beebe_2017-7"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-NB_9_8-0" class="reference"><a href="#cite_note-NB_9-8"><span class="cite-bracket">[</span>nb 3<span class="cite-bracket">]</span></a></sup>), base eight (octal floating point<sup id="cite_ref-Muller_2010_1-2" class="reference"><a href="#cite_note-Muller_2010-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Beebe_2017_7-1" class="reference"><a href="#cite_note-Beebe_2017-7"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Savard_2018_9-0" class="reference"><a href="#cite_note-Savard_2018-9"><span class="cite-bracket">[</span>6<span 
class="cite-bracket">]</span></a></sup><sup id="cite_ref-Zehendner_2008_6-1" class="reference"><a href="#cite_note-Zehendner_2008-6"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-NB_8_10-0" class="reference"><a href="#cite_note-NB_8-10"><span class="cite-bracket">[</span>nb 4<span class="cite-bracket">]</span></a></sup>), base four (quaternary floating point<sup id="cite_ref-Parkinson_2000_11-0" class="reference"><a href="#cite_note-Parkinson_2000-11"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Beebe_2017_7-2" class="reference"><a href="#cite_note-Beebe_2017-7"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-NB_11_12-0" class="reference"><a href="#cite_note-NB_11-12"><span class="cite-bracket">[</span>nb 5<span class="cite-bracket">]</span></a></sup>), base three (<a href="/wiki/Balanced_ternary_floating_point" class="mw-redirect" title="Balanced ternary floating point">balanced ternary floating point</a><sup id="cite_ref-Muller_2010_1-3" class="reference"><a href="#cite_note-Muller_2010-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup>) and even base 256<sup id="cite_ref-Beebe_2017_7-3" class="reference"><a href="#cite_note-Beebe_2017-7"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-NB_12_13-0" class="reference"><a href="#cite_note-NB_12-13"><span class="cite-bracket">[</span>nb 6<span class="cite-bracket">]</span></a></sup> and base <span class="nowrap"><span data-sort-value="7004655360000000000♠"></span>65,536</span>.<sup id="cite_ref-Lazarus_1956_14-0" class="reference"><a href="#cite_note-Lazarus_1956-14"><span class="cite-bracket">[</span>8<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-NB_10_15-0" class="reference"><a href="#cite_note-NB_10-15"><span class="cite-bracket">[</span>nb 7<span 
class="cite-bracket">]</span></a></sup> </p><p>A floating-point number is a <a href="/wiki/Rational_number" title="Rational number">rational number</a>, because it can be represented as one integer divided by another; for example <span class="nowrap"><span data-sort-value="7003145000000000000♠"></span>1.45<span style="margin-left:0.25em;margin-right:0.15em;">×</span>10<sup>3</sup></span> is (145/100)×1000 or <span class="nowrap"><span data-sort-value="7005145000000000000♠"></span>145,000</span>/100. The base determines the fractions that can be represented; for instance, 1/5 cannot be represented exactly as a floating-point number using a binary base, but 1/5 can be represented exactly using a decimal base (<span class="nowrap"><span data-sort-value="6999200000000000000♠"></span>0.2</span>, or <span class="nowrap"><span data-sort-value="6999200000000000000♠"></span>2<span style="margin-left:0.25em;margin-right:0.15em;">×</span>10<sup>−1</sup></span>). However, 1/3 cannot be represented exactly by either binary (0.010101...) or decimal (0.333...), but in <a href="/wiki/Ternary_numeral_system" title="Ternary numeral system">base 3</a>, it is trivial (0.1 or 1×3<sup>−1</sup>). The occasions on which infinite expansions occur <a href="/wiki/Positional_notation#Infinite_representations" title="Positional notation">depend on the base and its prime factors</a>. </p><p>The way in which the significand (including its sign) and exponent are stored in a computer is implementation-dependent. 
The common IEEE formats are described in detail later and elsewhere, but as an example, in the binary single-precision (32-bit) floating-point representation, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle p=24}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>p</mi> <mo>=</mo> <mn>24</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle p=24}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e7a1f78962b615188b35552e3a5c12c49edd7192" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; margin-left: -0.089ex; width:6.682ex; height:2.509ex;" alt="{\displaystyle p=24}" /></span>, and so the significand is a string of 24 <a href="/wiki/Bit" title="Bit">bits</a>. For instance, the number <a href="/wiki/Pi" title="Pi">π</a>'s first 33 bits are: <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 11001001\ 00001111\ 1101101{\underline {0}}\ 10100010\ 0.}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>11001001</mn> <mtext> </mtext> <mn>00001111</mn> <mtext> </mtext> <mn>1101101</mn> <mrow class="MJX-TeXAtom-ORD"> <munder> <mn>0</mn> <mo>_<!-- _ --></mo> </munder> </mrow> <mtext> </mtext> <mn>10100010</mn> <mtext> </mtext> <mn>0.</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 11001001\ 00001111\ 1101101{\underline {0}}\ 10100010\ 0.}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/303a069975e556333e66839b929b63c1c618ac84" class="mwe-math-fallback-image-display mw-invert 
skin-invert" aria-hidden="true" style="vertical-align: -0.562ex; margin-bottom: -0.776ex; width:41.33ex; height:3.176ex;" alt="{\displaystyle 11001001\ 00001111\ 1101101{\underline {0}}\ 10100010\ 0.}" /></span> </p><p>In this binary expansion, let us denote the positions from 0 (leftmost bit, or most significant bit) to 32 (rightmost bit). The 24-bit significand will stop at position 23, shown as the underlined bit <span class="nowrap"><span data-sort-value="5000000000000000000♠"></span>0</span> above. The next bit, at position 24, is called the <i>round bit</i> or <i>rounding bit</i>. It is used to round the 33-bit approximation to the nearest 24-bit number (there are <a href="/wiki/Rounding#Tie-breaking" title="Rounding">specific rules for halfway values</a>, which is not the case here). This bit, which is <span class="nowrap"><span data-sort-value="7000100000000000000♠"></span>1</span> in this example, is added to the integer formed by the leftmost 24 bits, yielding: <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 11001001\ 00001111\ 1101101{\underline {1}}.}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>11001001</mn> <mtext> </mtext> <mn>00001111</mn> <mtext> </mtext> <mn>1101101</mn> <mrow class="MJX-TeXAtom-ORD"> <munder> <mn>1</mn> <mo>_<!-- _ --></mo> </munder> </mrow> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 11001001\ 00001111\ 1101101{\underline {1}}.}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/2a95f70a59ffb71883524e67a8084da07a1ffa3e" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.511ex; margin-bottom: -0.827ex; width:29.707ex; height:3.176ex;" alt="{\displaystyle 11001001\ 00001111\ 
1101101{\underline {1}}.}" /></span> </p><p>When this is stored in memory using the IEEE 754 encoding, this becomes the <a href="/wiki/Significand" title="Significand">significand</a> <span class="texhtml mvar" style="font-style:italic;">s</span>. The significand is assumed to have a binary point to the right of the leftmost bit. So, the binary representation of π is calculated from left-to-right as follows: <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\begin{aligned}&\left(\sum _{n=0}^{p-1}{\text{bit}}_{n}\times 2^{-n}\right)\times 2^{e}\\={}&\left(1\times 2^{-0}+1\times 2^{-1}+0\times 2^{-2}+0\times 2^{-3}+1\times 2^{-4}+\cdots +1\times 2^{-23}\right)\times 2^{1}\\\approx {}&1.57079637\times 2\\\approx {}&3.1415927\end{aligned}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"> <mtr> <mtd></mtd> <mtd> <mrow> <mo>(</mo> <mrow> <munderover> <mo>∑<!-- ∑ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> <mo>=</mo> <mn>0</mn> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>p</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </munderover> <msub> <mrow class="MJX-TeXAtom-ORD"> <mtext>bit</mtext> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </msub> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mi>n</mi> </mrow> </msup> </mrow> <mo>)</mo> </mrow> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow class="MJX-TeXAtom-ORD"> <mi>e</mi> </mrow> </msup> </mtd> </mtr> <mtr> <mtd> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> </mrow> </mtd> <mtd> <mrow> <mo>(</mo> <mrow> <mn>1</mn> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow 
class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mn>0</mn> </mrow> </msup> <mo>+</mo> <mn>1</mn> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msup> <mo>+</mo> <mn>0</mn> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mn>2</mn> </mrow> </msup> <mo>+</mo> <mn>0</mn> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mn>3</mn> </mrow> </msup> <mo>+</mo> <mn>1</mn> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mn>4</mn> </mrow> </msup> <mo>+</mo> <mo>⋯<!-- ⋯ --></mo> <mo>+</mo> <mn>1</mn> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mn>23</mn> </mrow> </msup> </mrow> <mo>)</mo> </mrow> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msup> </mtd> </mtr> <mtr> <mtd> <mo>≈<!-- ≈ --></mo> <mrow class="MJX-TeXAtom-ORD"> </mrow> </mtd> <mtd> <mn>1.57079637</mn> <mo>×<!-- × --></mo> <mn>2</mn> </mtd> </mtr> <mtr> <mtd> <mo>≈<!-- ≈ --></mo> <mrow class="MJX-TeXAtom-ORD"> </mrow> </mtd> <mtd> <mn>3.1415927</mn> </mtd> </mtr> </mtable> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\begin{aligned}&\left(\sum _{n=0}^{p-1}{\text{bit}}_{n}\times 2^{-n}\right)\times 2^{e}\\={}&\left(1\times 2^{-0}+1\times 2^{-1}+0\times 2^{-2}+0\times 2^{-3}+1\times 2^{-4}+\cdots +1\times 2^{-23}\right)\times 2^{1}\\\approx {}&1.57079637\times 2\\\approx {}&3.1415927\end{aligned}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/43c70fb4f7cc16f28fee926ed0d22b2db4e10b82" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -8.005ex; width:75.968ex; height:17.176ex;" alt="{\displaystyle {\begin{aligned}&\left(\sum _{n=0}^{p-1}{\text{bit}}_{n}\times 2^{-n}\right)\times 
2^{e}\\={}&\left(1\times 2^{-0}+1\times 2^{-1}+0\times 2^{-2}+0\times 2^{-3}+1\times 2^{-4}+\cdots +1\times 2^{-23}\right)\times 2^{1}\\\approx {}&1.57079637\times 2\\\approx {}&3.1415927\end{aligned}}}" /></span> </p><p>where <span class="texhtml mvar" style="font-style:italic;">p</span> is the precision (<span class="nowrap"><span data-sort-value="7001240000000000000♠"></span>24</span> in this example), <span class="texhtml mvar" style="font-style:italic;">n</span> is the position of the bit of the significand from the left (starting at <span class="nowrap"><span data-sort-value="5000000000000000000♠"></span>0</span> and finishing at <span class="nowrap"><span data-sort-value="7001230000000000000♠"></span>23</span> here) and <span class="texhtml mvar" style="font-style:italic;">e</span> is the exponent (<span class="nowrap"><span data-sort-value="7000100000000000000♠"></span>1</span> in this example). </p><p><span class="anchor" id="Hidden_bit"></span>It can be required that the most significant digit of the significand of a non-zero number be non-zero (except when the corresponding exponent would be smaller than the minimum one). This process is called <i>normalization</i>. For binary formats (which use only the digits <span class="nowrap"><span data-sort-value="5000000000000000000♠"></span>0</span> and <span class="nowrap"><span data-sort-value="7000100000000000000♠"></span>1</span>), this non-zero digit is necessarily <span class="nowrap"><span data-sort-value="7000100000000000000♠"></span>1</span>. Therefore, it does not need to be represented in memory, allowing the format to have one more bit of precision. This rule is variously called the <i>leading bit convention</i>, the <i>implicit bit convention</i>, the <i>hidden bit convention</i>,<sup id="cite_ref-Muller_2010_1-4" class="reference"><a href="#cite_note-Muller_2010-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup> or the <i>assumed bit convention</i>. 
</p> <div class="mw-heading mw-heading3"><h3 id="Alternatives_to_floating-point_numbers">Alternatives to floating-point numbers</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=3" title="Edit section: Alternatives to floating-point numbers"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The floating-point representation is by far the most common way of representing in computers an approximation to real numbers. However, there are alternatives: </p> <ul><li><a href="/wiki/Fixed-point_arithmetic" title="Fixed-point arithmetic">Fixed-point</a> representation uses integer hardware operations controlled by a software implementation of a specific convention about the location of the binary or decimal point, for example, 6 bits or digits from the right. The hardware to manipulate these representations is less costly than floating point, and it can be used to perform normal integer operations, too. Binary fixed point is usually used in special-purpose applications on embedded processors that can only do integer arithmetic, but decimal fixed point is common in commercial applications.</li> <li><a href="/wiki/Logarithmic_number_system" title="Logarithmic number system">Logarithmic number systems</a> (LNSs) represent a real number by the logarithm of its absolute value and a sign bit. The value distribution is similar to floating point, but the value-to-representation curve (<i>i.e.</i>, the graph of the logarithm function) is smooth (except at 0). Conversely to floating-point arithmetic, in a logarithmic number system multiplication, division and exponentiation are simple to implement, but addition and subtraction are complex. 
The (<a href="/wiki/Symmetric_level-index_arithmetic" title="Symmetric level-index arithmetic">symmetric</a>) <a href="/wiki/Level-index_arithmetic" class="mw-redirect" title="Level-index arithmetic">level-index arithmetic</a> (LI and SLI) of Charles Clenshaw, <a href="/wiki/Frank_William_John_Olver" class="mw-redirect" title="Frank William John Olver">Frank Olver</a> and Peter Turner is a scheme based on a <a href="/wiki/Generalized_logarithm" class="mw-redirect" title="Generalized logarithm">generalized logarithm</a> representation.</li> <li><a href="/wiki/Tapered_floating-point_representation" class="mw-redirect" title="Tapered floating-point representation">Tapered floating-point representation</a>, used in <a href="/wiki/Unum_(number_format)" title="Unum (number format)">Unum</a>.</li> <li>Some simple rational numbers (<i>e.g.</i>, 1/3 and 1/10) cannot be represented exactly in binary floating point, no matter what the precision is. Using a different radix allows one to represent some of them (<i>e.g.</i>, 1/10 in decimal floating point), but the possibilities remain limited. Software packages that perform <a href="/wiki/Fraction" title="Fraction">rational arithmetic</a> represent numbers as fractions with integral numerator and denominator, and can therefore represent any rational number exactly. Such packages generally need to use "<a href="/wiki/Bignum" class="mw-redirect" title="Bignum">bignum</a>" arithmetic for the individual integers.</li> <li><a href="/wiki/Interval_arithmetic" title="Interval arithmetic">Interval arithmetic</a> allows one to represent numbers as intervals and obtain guaranteed bounds on results. 
It is generally based on other arithmetics, in particular floating point.</li> <li><a href="/wiki/Computer_algebra_system" title="Computer algebra system">Computer algebra systems</a> such as <a href="/wiki/Mathematica" class="mw-redirect" title="Mathematica">Mathematica</a>, <a href="/wiki/Maxima_(software)" title="Maxima (software)">Maxima</a>, and <a href="/wiki/Maple_(software)" title="Maple (software)">Maple</a> can often handle irrational numbers like <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \pi }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>π<!-- π --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \pi }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9be4ba0bb8df3af72e90a0535fabcc17431e540a" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.332ex; height:1.676ex;" alt="{\displaystyle \pi }" /></span> or <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\sqrt {3}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <msqrt> <mn>3</mn> </msqrt> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\sqrt {3}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/3b19c09494138b5082459afac7f9a8d99c546fcd" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:3.098ex; height:2.843ex;" alt="{\displaystyle {\sqrt {3}}}" /></span> in a completely "formal" way (<a 
href="/wiki/Symbolic_computation" class="mw-redirect" title="Symbolic computation">symbolic computation</a>), without dealing with a specific encoding of the significand. Such a program can evaluate expressions like "<span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \sin(3\pi )}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>sin</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mn>3</mn> <mi>π<!-- π --></mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \sin(3\pi )}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/7d239f0e06c5d96ed18c3add618e47f27b3534b3" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:7.159ex; height:2.843ex;" alt="{\displaystyle \sin(3\pi )}" /></span>" exactly, because it is programmed to process the underlying mathematics directly, instead of using approximate values for each intermediate calculation.</li></ul> <div class="mw-heading mw-heading2"><h2 id="History">History</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=4" title="Edit section: History"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951" /><div role="note" class="hatnote navigation-not-searchable">See also: <a href="/wiki/IEEE_754#History" title="IEEE 754">IEEE 754 § History</a></div> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:Quevedo_1917.jpg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c0/Quevedo_1917.jpg/150px-Quevedo_1917.jpg" 
decoding="async" width="150" height="157" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/c0/Quevedo_1917.jpg/225px-Quevedo_1917.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/c0/Quevedo_1917.jpg/300px-Quevedo_1917.jpg 2x" data-file-width="831" data-file-height="872" /></a><figcaption><a href="/wiki/Leonardo_Torres_Quevedo" title="Leonardo Torres Quevedo">Leonardo Torres Quevedo</a>, in 1914, published an analysis of floating point based on the <a href="/wiki/Analytical_engine" title="Analytical engine">analytical engine</a>.</figcaption></figure> <p>In 1914, the Spanish engineer <a href="/wiki/Leonardo_Torres_Quevedo" title="Leonardo Torres Quevedo">Leonardo Torres Quevedo</a> published <i>Essays on Automatics</i>,<sup id="cite_ref-16" class="reference"><a href="#cite_note-16"><span class="cite-bracket">[</span>9<span class="cite-bracket">]</span></a></sup> where he designed a special-purpose electromechanical calculator based on <a href="/wiki/Charles_Babbage" title="Charles Babbage">Charles Babbage</a>'s <a href="/wiki/Analytical_engine" title="Analytical engine">analytical engine</a> and described a way to store floating-point numbers in a consistent manner. 
He stated that numbers will be stored in exponential format as <i>n</i> x 10<span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle ^{m}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>m</mi> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle ^{m}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/a8c4d272ab903e44501a5ac0aff0c8f2b0fcf611" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.171ex; width:1.675ex; height:2.176ex;" alt="{\displaystyle ^{m}}" /></span>, and offered three rules by which consistent manipulation of floating-point numbers by machines could be implemented. For Torres, "<i>n</i> will always be the same number of <a href="/wiki/Numerical_digit" title="Numerical digit">digits</a> (e.g. six), the first digit of <i>n</i> will be of order of tenths, the second of hundredths, etc, and one will write each quantity in the form: <i>n</i>; <i>m</i>." 
The format he proposed shows the need for a fixed-sized significand as is presently used for floating-point data, fixing the location of the decimal point in the significand so that each representation was unique, and how to format such numbers by specifying a syntax to be used that could be entered through a <a href="/wiki/Typewriter" title="Typewriter">typewriter</a>, as was the case of his <a href="/wiki/Leonardo_Torres_y_Quevedo#Analytical_machines" class="mw-redirect" title="Leonardo Torres y Quevedo">Electromechanical Arithmometer</a> in 1920.<sup id="cite_ref-17" class="reference"><a href="#cite_note-17"><span class="cite-bracket">[</span>10<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-FOOTNOTERandell19826,_11–13_18-0" class="reference"><a href="#cite_note-FOOTNOTERandell19826,_11–13-18"><span class="cite-bracket">[</span>11<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-19" class="reference"><a href="#cite_note-19"><span class="cite-bracket">[</span>12<span class="cite-bracket">]</span></a></sup> </p> <figure class="mw-default-size mw-halign-right" typeof="mw:File/Thumb"><a href="/wiki/File:Konrad_Zuse_(1992).jpg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/d/da/Konrad_Zuse_%281992%29.jpg/150px-Konrad_Zuse_%281992%29.jpg" decoding="async" width="150" height="200" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/d/da/Konrad_Zuse_%281992%29.jpg/225px-Konrad_Zuse_%281992%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/d/da/Konrad_Zuse_%281992%29.jpg/300px-Konrad_Zuse_%281992%29.jpg 2x" data-file-width="354" data-file-height="472" /></a><figcaption><a href="/wiki/Konrad_Zuse" title="Konrad Zuse">Konrad Zuse</a>, architect of the <a href="/wiki/Z3_(computer)" title="Z3 (computer)">Z3</a> computer, which uses a 22-bit binary floating-point representation</figcaption></figure> <p>In 1938, <a href="/wiki/Konrad_Zuse" title="Konrad Zuse">Konrad Zuse</a> 
of Berlin completed the <a href="/wiki/Z1_(computer)" title="Z1 (computer)">Z1</a>, the first binary, programmable <a href="/wiki/Mechanical_computer" title="Mechanical computer">mechanical computer</a>;<sup id="cite_ref-Rojas_1997_20-0" class="reference"><a href="#cite_note-Rojas_1997-20"><span class="cite-bracket">[</span>13<span class="cite-bracket">]</span></a></sup> it uses a 24-bit binary floating-point number representation with a 7-bit signed exponent, a 17-bit significand (including one implicit bit), and a sign bit.<sup id="cite_ref-Rojas_2014_21-0" class="reference"><a href="#cite_note-Rojas_2014-21"><span class="cite-bracket">[</span>14<span class="cite-bracket">]</span></a></sup> The more reliable <a href="/wiki/Relay" title="Relay">relay</a>-based <a href="/wiki/Z3_(computer)" title="Z3 (computer)">Z3</a>, completed in 1941, has representations for both positive and negative infinities; in particular, it implements defined operations with infinity, such as <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle ^{1}/_{\infty }=0}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msup> <msub> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">∞<!-- ∞ --></mi> </mrow> </msub> <mo>=</mo> <mn>0</mn> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle ^{1}/_{\infty }=0}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/23ef4d558c2fedd08aaaa744e83c7d36dd85cce6" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:8.353ex; height:3.343ex;" alt="{\displaystyle ^{1}/_{\infty }=0}" /></span>, and it stops on undefined operations, 
such as <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 0\times \infty }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>0</mn> <mo>×<!-- × --></mo> <mi mathvariant="normal">∞<!-- ∞ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 0\times \infty }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/6c2c67d872e7859a5b51d652639651d1e1384df0" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:6.327ex; height:2.176ex;" alt="{\displaystyle 0\times \infty }" /></span>. </p><p>Zuse also proposed, but did not complete, carefully rounded floating-point arithmetic that includes <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \pm \infty }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo>±<!-- ± --></mo> <mi mathvariant="normal">∞<!-- ∞ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \pm \infty }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c586ae37f8efec026b8a4ea3f6a5253576c2c4e6" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:4.132ex; height:2.176ex;" alt="{\displaystyle \pm \infty }" /></span> and NaN representations, anticipating features of the IEEE Standard by four decades.<sup id="cite_ref-Kahan_1997_JVNL_22-0" class="reference"><a href="#cite_note-Kahan_1997_JVNL-22"><span class="cite-bracket">[</span>15<span class="cite-bracket">]</span></a></sup> In contrast, <a href="/wiki/John_von_Neumann" 
title="John von Neumann">von Neumann</a> recommended against floating-point numbers for the 1951 <a href="/wiki/IAS_machine" title="IAS machine">IAS machine</a>, arguing that fixed-point arithmetic is preferable.<sup id="cite_ref-Kahan_1997_JVNL_22-1" class="reference"><a href="#cite_note-Kahan_1997_JVNL-22"><span class="cite-bracket">[</span>15<span class="cite-bracket">]</span></a></sup> </p><p>The first <i>commercial</i> computer with floating-point hardware was Zuse's <a href="/wiki/Z4_(computer)" title="Z4 (computer)">Z4</a> computer, designed in 1942–1945. In 1946, Bell Laboratories introduced the <a href="/wiki/Model_V" title="Model V">Model V</a>, which implemented <a href="/wiki/Decimal_floating_point" title="Decimal floating point">decimal floating-point numbers</a>.<sup id="cite_ref-Randell_1982_2_23-0" class="reference"><a href="#cite_note-Randell_1982_2-23"><span class="cite-bracket">[</span>16<span class="cite-bracket">]</span></a></sup> </p><p>The <a href="/wiki/Pilot_ACE" title="Pilot ACE">Pilot ACE</a> has binary floating-point arithmetic, and it became operational in 1950 at <a href="/wiki/National_Physical_Laboratory,_UK" class="mw-redirect" title="National Physical Laboratory, UK">National Physical Laboratory, UK</a>. Thirty-three were later sold commercially as the <a href="/wiki/English_Electric_DEUCE" title="English Electric DEUCE">English Electric DEUCE</a>. The arithmetic is actually implemented in software, but with a one megahertz clock rate, floating-point and fixed-point operations in this machine were initially faster than those of many competing computers. </p><p>The mass-produced <a href="/wiki/IBM_704" title="IBM 704">IBM 704</a> followed in 1954; it introduced the use of a <a href="/wiki/Exponent_bias" title="Exponent bias">biased exponent</a>. 
For many decades after that, floating-point hardware was typically an optional feature, and computers that had it were said to be "scientific computers", or to have "<a href="/wiki/Scientific_computation" class="mw-redirect" title="Scientific computation">scientific computation</a>" (SC) capability (see also <a href="/wiki/Extensions_for_Scientific_Computation" class="mw-redirect" title="Extensions for Scientific Computation">Extensions for Scientific Computation</a> (XSC)). It was not until the launch of the Intel i486 in 1989 that <i>general-purpose</i> personal computers had floating-point capability in hardware as a standard feature. </p><p>The <a href="/wiki/UNIVAC_1100/2200_series" title="UNIVAC 1100/2200 series">UNIVAC 1100/2200 series</a>, introduced in 1962, supported two floating-point representations: </p> <ul><li><i>Single precision</i>: 36 bits, organized as a 1-bit sign, an 8-bit exponent, and a 27-bit significand.</li> <li><i>Double precision</i>: 72 bits, organized as a 1-bit sign, an 11-bit exponent, and a 60-bit significand.</li></ul> <p>The <a href="/wiki/IBM_7094" class="mw-redirect" title="IBM 7094">IBM 7094</a>, also introduced in 1962, supported single-precision and double-precision representations, but with no relation to the UNIVAC's representations. Indeed, in 1964, IBM introduced <a href="/wiki/IBM_hexadecimal_floating-point" title="IBM hexadecimal floating-point">hexadecimal floating-point representations</a> in its <a href="/wiki/System/360" class="mw-redirect" title="System/360">System/360</a> mainframes; these same representations are still available for use in modern <a href="/wiki/Z/Architecture" title="Z/Architecture">z/Architecture</a> systems. In 1998, IBM implemented IEEE-compatible binary floating-point arithmetic in its mainframes; in 2005, IBM also added IEEE-compatible decimal floating-point arithmetic. </p><p>Initially, computers used many different representations for floating-point numbers. 
The lack of standardization at the mainframe level was an ongoing problem by the early 1970s for those writing and maintaining higher-level source code; these manufacturer floating-point standards differed in the word sizes, the representations, and the rounding behavior and general accuracy of operations. Floating-point compatibility across multiple computing systems was in desperate need of standardization by the early 1980s, leading to the creation of the <a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a> standard once the 32-bit (or 64-bit) <a href="/wiki/Word_(computer_architecture)" title="Word (computer architecture)">word</a> had become commonplace. This standard was significantly based on a proposal from Intel, which was designing the <a href="/wiki/Intel_8087" title="Intel 8087">i8087</a> numerical coprocessor; Motorola, which was designing the <a href="/wiki/68000" class="mw-redirect" title="68000">68000</a> around the same time, gave significant input as well. </p> <figure class="mw-default-size mw-halign-right" typeof="mw:File/Thumb"><a href="/wiki/File:William_Kahan_2008.jpg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/William_Kahan_2008.jpg/250px-William_Kahan_2008.jpg" decoding="async" width="150" height="106" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/William_Kahan_2008.jpg/330px-William_Kahan_2008.jpg 2x" data-file-width="3225" data-file-height="2287" /></a><figcaption><a href="/wiki/William_Kahan" title="William Kahan">William Kahan</a>, principal architect of the <a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a> floating-point standard</figcaption></figure> <p>In 1989, mathematician and computer scientist <a href="/wiki/William_Kahan" title="William Kahan">William Kahan</a> was honored with the <a href="/wiki/Turing_Award" title="Turing Award">Turing Award</a> for being the primary architect behind this proposal; he was aided by his student Jerome 
Coonen and a visiting professor, <a href="/wiki/Harold_S._Stone" title="Harold S. Stone">Harold Stone</a>.<sup id="cite_ref-Severance_1998_24-0" class="reference"><a href="#cite_note-Severance_1998-24"><span class="cite-bracket">[</span>17<span class="cite-bracket">]</span></a></sup> </p><p>Among the x86 innovations are these: </p> <ul><li>A precisely specified floating-point representation at the bit-string level, so that all compliant computers interpret bit patterns the same way. This makes it possible to accurately and efficiently transfer floating-point numbers from one computer to another (after accounting for <a href="/wiki/Endianness" title="Endianness">endianness</a>).</li> <li>A precisely specified behavior for the arithmetic operations: A result is required to be produced as if infinitely precise arithmetic were used to yield a value that is then rounded according to specific rules. This means that a compliant computer program would always produce the same result when given a particular input, thus mitigating the almost mystical reputation that floating-point computation had developed for its hitherto seemingly non-deterministic behavior.</li> <li>The ability of <a href="/wiki/IEEE_754#Exception_handling" title="IEEE 754">exceptional conditions</a> (overflow, <a href="/wiki/Division_by_zero" title="Division by zero">divide by zero</a>, etc.) 
to propagate through a computation in a benign manner and then be handled by the software in a controlled fashion.</li></ul> <div class="mw-heading mw-heading2"><h2 id="Range_of_floating-point_numbers">Range of floating-point numbers</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=5" title="Edit section: Range of floating-point numbers"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A floating-point number consists of two <a href="/wiki/Fixed-point_arithmetic" title="Fixed-point arithmetic">fixed-point</a> components, whose range depends exclusively on the number of bits or digits in their representation. Whereas the components depend linearly on their range, the floating-point range depends linearly on the significand range and exponentially on the range of the exponent component, which gives the number an outstandingly wider range. </p><p>On a typical computer system, a <i><a href="/wiki/Double-precision_floating-point_format" title="Double-precision floating-point format">double-precision</a></i> (64-bit) binary floating-point number has a coefficient of 53 bits (including 1 implied bit), an exponent of 11 bits, and 1 sign bit. Since 2<sup>10</sup> = 1024, the complete range of the positive normal floating-point numbers in this format is from 2<sup>−1022</sup> ≈ 2 × 10<sup>−308</sup> to approximately 2<sup>1024</sup> ≈ 2 × 10<sup>308</sup>. 
</p><p>The number of normal floating-point numbers in a system (<i>B</i>, <i>P</i>, <i>L</i>, <i>U</i>) where </p> <ul><li><i>B</i> is the base of the system,</li> <li><i>P</i> is the precision of the significand (in base <i>B</i>),</li> <li><i>L</i> is the smallest exponent of the system,</li> <li><i>U</i> is the largest exponent of the system,</li></ul> <p>is <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 2\left(B-1\right)\left(B^{P-1}\right)\left(U-L+1\right)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>2</mn> <mrow> <mo>(</mo> <mrow> <mi>B</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> <mo>)</mo> </mrow> <mrow> <mo>(</mo> <msup> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>P</mi> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> </msup> <mo>)</mo> </mrow> <mrow> <mo>(</mo> <mrow> <mi>U</mi> <mo>−<!-- − --></mo> <mi>L</mi> <mo>+</mo> <mn>1</mn> </mrow> <mo>)</mo> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 2\left(B-1\right)\left(B^{P-1}\right)\left(U-L+1\right)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/50a5e9a91a7338cf13ed00dfe9042b7aa0d8a79b" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:29.379ex; height:3.343ex;" alt="{\displaystyle 2\left(B-1\right)\left(B^{P-1}\right)\left(U-L+1\right)}" /></span>. 
</p><p>There is a smallest positive normal floating-point number, </p> <dl><dd>Underflow level = UFL = <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle B^{L}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>L</mi> </mrow> </msup> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle B^{L}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/03c80b97f6b23ac47189791d0c01da71ab982531" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:3.116ex; height:2.676ex;" alt="{\displaystyle B^{L}}" /></span>,</dd></dl> <p>which has a 1 as the leading digit and 0 for the remaining digits of the significand, and the smallest possible value for the exponent. 
</p><p>There is a largest floating-point number, </p> <dl><dd>Overflow level = OFL = <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \left(1-B^{-P}\right)\left(B^{U+1}\right)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow> <mo>(</mo> <mrow> <mn>1</mn> <mo>−<!-- − --></mo> <msup> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mo>−<!-- − --></mo> <mi>P</mi> </mrow> </msup> </mrow> <mo>)</mo> </mrow> <mrow> <mo>(</mo> <msup> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>U</mi> <mo>+</mo> <mn>1</mn> </mrow> </msup> <mo>)</mo> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \left(1-B^{-P}\right)\left(B^{U+1}\right)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/89ef88d856c29d99a33aa71976c5827a32abf281" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.005ex; width:18.516ex; height:3.343ex;" alt="{\displaystyle \left(1-B^{-P}\right)\left(B^{U+1}\right)}" /></span>,</dd></dl> <p>which has <i>B</i> − 1 as the value for each digit of the significand and the largest possible value for the exponent. </p><p>In addition, there are representable values strictly between −UFL and UFL: namely, <a href="/wiki/Signed_zero" title="Signed zero">positive and negative zeros</a>, as well as <a href="/wiki/Subnormal_number" title="Subnormal number">subnormal numbers</a>. 
</p> <div class="mw-heading mw-heading2"><h2 id="IEEE_754:_floating_point_in_modern_computers">IEEE 754: floating point in modern computers <span class="anchor" id="IEEE_754"></span></h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=6" title="Edit section: IEEE 754: floating point in modern computers"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951" /><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" /><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1126788409" /><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1246091330" /><table class="sidebar nomobile nowraplinks plainlist"><tbody><tr><th class="sidebar-title"><a class="mw-selflink selflink">Floating-point</a> <a href="/wiki/Computer_number_format" title="Computer number format">formats</a></th></tr><tr><th class="sidebar-heading"> <a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a></th></tr><tr><td class="sidebar-content"> <ul><li>16-bit: <a href="/wiki/Half-precision_floating-point_format" title="Half-precision floating-point format">Half</a> (binary16)</li> <li>32-bit: <a href="/wiki/Single-precision_floating-point_format" title="Single-precision floating-point format">Single</a> (binary32), <a href="/wiki/Decimal32_floating-point_format" title="Decimal32 floating-point format">decimal32</a></li> <li>64-bit: <a href="/wiki/Double-precision_floating-point_format" title="Double-precision floating-point format">Double</a> (binary64), <a href="/wiki/Decimal64_floating-point_format" title="Decimal64 floating-point format">decimal64</a></li> <li>128-bit: <a 
href="/wiki/Quadruple-precision_floating-point_format" title="Quadruple-precision floating-point format">Quadruple</a> (binary128), <a href="/wiki/Decimal128_floating-point_format" title="Decimal128 floating-point format">decimal128</a></li> <li>256-bit: <a href="/wiki/Octuple-precision_floating-point_format" title="Octuple-precision floating-point format">Octuple</a> (binary256)</li> <li><a href="/wiki/Extended_precision" title="Extended precision">Extended precision</a></li></ul></td> </tr><tr><th class="sidebar-heading"> Other</th></tr><tr><td class="sidebar-content"> <ul><li><a href="/wiki/Minifloat" title="Minifloat">Minifloat</a></li> <li><a href="/wiki/Bfloat16_floating-point_format" title="Bfloat16 floating-point format">bfloat16</a></li> <li><a href="/wiki/TensorFloat-32" title="TensorFloat-32">TensorFloat-32</a></li> <li><a href="/wiki/Microsoft_Binary_Format" title="Microsoft Binary Format">Microsoft Binary Format</a></li> <li><a href="/wiki/IBM_hexadecimal_floating-point" title="IBM hexadecimal floating-point">IBM floating-point architecture</a></li> <li><a href="/wiki/Power_Management_Bus#Linear11_Floating-Point_Format" title="Power Management Bus">PMBus Linear-11</a></li> <li><a href="/wiki/G.711" title="G.711">G.711 8-bit floats</a></li></ul></td> </tr><tr><th class="sidebar-heading"> Alternatives</th></tr><tr><td class="sidebar-content"> <ul><li><a href="/wiki/Arbitrary-precision_arithmetic" title="Arbitrary-precision arithmetic">Arbitrary precision</a></li></ul></td> </tr><tr><th class="sidebar-heading"> <a href="/wiki/Tapered_floating_point" title="Tapered floating point">Tapered floating point</a></th></tr><tr><td class="sidebar-content"> <ul><li><a href="/wiki/Unum_(number_format)" title="Unum (number format)">Posit</a></li></ul></td> </tr><tr><td class="sidebar-navbar"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" /><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239400231" /><div 
class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Floating-point" title="Template:Floating-point"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Floating-point" title="Template talk:Floating-point"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Floating-point" title="Special:EditPage/Template:Floating-point"><abbr title="Edit this template">e</abbr></a></li></ul></div></td></tr></tbody></table> <p>The <a href="/wiki/Institute_of_Electrical_and_Electronics_Engineers" title="Institute of Electrical and Electronics Engineers">IEEE</a> standardized the computer representation for binary floating-point numbers in <a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a> (a.k.a. IEC 60559) in 1985. This first standard is followed by almost all modern machines. It was <a href="/wiki/IEEE_754-2008_revision" title="IEEE 754-2008 revision">revised in 2008</a>. IBM mainframes support <a href="/wiki/IBM_hexadecimal_floating_point" class="mw-redirect" title="IBM hexadecimal floating point">IBM's own hexadecimal floating point format</a> and IEEE 754-2008 <a href="/wiki/Decimal_floating_point" title="Decimal floating point">decimal floating point</a> in addition to the IEEE 754 binary format. The <a href="/wiki/Cray_T90" title="Cray T90">Cray T90</a> series had an IEEE version, but the <a href="/wiki/Cray_SV1" title="Cray SV1">SV1</a> still uses Cray floating-point format.<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (July 2020)">citation needed</span></a></i>]</sup> </p><p>The standard provides for many closely related formats, differing in only a few details. 
Five of these formats are called <i>basic formats</i>, and others are termed <i>extended precision formats</i> and <i>extendable precision format</i>. Three formats are especially widely used in computer hardware and languages:<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="Possibly wrong for double extended: OK for hardware, but for languages? Note that in C, long double may not correspond to double extended (see 32-bit ARM and PowerPC). (July 2020)">citation needed</span></a></i>]</sup> </p> <ul><li><a href="/wiki/Single-precision_floating-point_format" title="Single-precision floating-point format">Single precision</a> (binary32), usually used to represent the "float" <a href="/wiki/C_data_types#Basic_types" title="C data types">type in the C language</a> family. This is a binary format that occupies 32 bits (4 bytes) and its significand has a precision of 24 bits (about 7 decimal digits).</li> <li><a href="/wiki/Double-precision_floating-point_format" title="Double-precision floating-point format">Double precision</a> (binary64), usually used to represent the "double" <a href="/wiki/C_data_types#Basic_types" title="C data types">type in the C language</a> family. This is a binary format that occupies 64 bits (8 bytes) and its significand has a precision of 53 bits (about 16 decimal digits).</li> <li><a href="/wiki/Extended_precision" title="Extended precision">Double extended</a>, also ambiguously called "extended precision" format. This is a binary format that occupies at least 79 bits (80 if the hidden/implicit bit rule is not used) and its significand has a precision of at least 64 bits (about 19 decimal digits). 
The <a href="/wiki/C99" title="C99">C99</a> and <a href="/wiki/C11_(C_standard_revision)" title="C11 (C standard revision)">C11</a> standards of the C language family, in their annex F ("IEC 60559 floating-point arithmetic"), recommend such an extended format to be provided as "<a href="/wiki/Long_double" title="Long double">long double</a>".<sup id="cite_ref-C99_25-0" class="reference"><a href="#cite_note-C99-25"><span class="cite-bracket">[</span>18<span class="cite-bracket">]</span></a></sup> A format satisfying the minimal requirements (64-bit significand precision, 15-bit exponent, thus fitting on 80 bits) is provided by the <a href="/wiki/X86" title="X86">x86</a> architecture. Often on such processors, this format can be used with "long double", though extended precision is not available with MSVC.<sup id="cite_ref-MSVC_26-0" class="reference"><a href="#cite_note-MSVC-26"><span class="cite-bracket">[</span>19<span class="cite-bracket">]</span></a></sup> For <a href="/wiki/Data_structure_alignment" title="Data structure alignment">alignment</a> purposes, many tools store this 80-bit value in a 96-bit or 128-bit space.<sup id="cite_ref-GCC_27-0" class="reference"><a href="#cite_note-GCC-27"><span class="cite-bracket">[</span>20<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-float_128_28-0" class="reference"><a href="#cite_note-float_128-28"><span class="cite-bracket">[</span>21<span class="cite-bracket">]</span></a></sup> On other processors, "long double" may stand for a larger format, such as quadruple precision,<sup id="cite_ref-ARM_2013_AArch64_29-0" class="reference"><a href="#cite_note-ARM_2013_AArch64-29"><span class="cite-bracket">[</span>22<span class="cite-bracket">]</span></a></sup> or just double precision, if any form of extended precision is not available.<sup id="cite_ref-ARM_2013_Compiler_30-0" class="reference"><a href="#cite_note-ARM_2013_Compiler-30"><span class="cite-bracket">[</span>23<span 
class="cite-bracket">]</span></a></sup></li></ul> <p>Increasing the precision of the floating-point representation generally reduces the amount of accumulated <a href="/wiki/Round-off_error" title="Round-off error">round-off error</a> caused by intermediate calculations.<sup id="cite_ref-Kahan_2004_31-0" class="reference"><a href="#cite_note-Kahan_2004-31"><span class="cite-bracket">[</span>24<span class="cite-bracket">]</span></a></sup> Other IEEE formats include: </p> <ul><li><a href="/wiki/Decimal64_floating-point_format" title="Decimal64 floating-point format">Decimal64</a> and <a href="/wiki/Decimal128_floating-point_format" title="Decimal128 floating-point format">decimal128</a> floating-point formats. These formats (especially decimal128) are pervasive in financial transactions because, along with the <a href="/wiki/Decimal32_floating-point_format" title="Decimal32 floating-point format">decimal32</a> format, they allow correct decimal rounding.</li> <li><a href="/wiki/Quadruple-precision_floating-point_format#IEEE_754_quadruple-precision_binary_floating-point_format:_binary128" title="Quadruple-precision floating-point format">Quadruple precision</a> (binary128). This is a binary format that occupies 128 bits (16 bytes) and its significand has a precision of 113 bits (about 34 decimal digits).</li> <li><a href="/wiki/Half-precision_floating-point_format" title="Half-precision floating-point format">Half precision</a>, also called binary16, a 16-bit floating-point value. 
It is used in the NVIDIA <a href="/wiki/Cg_(programming_language)" title="Cg (programming language)">Cg</a> graphics language, and in the OpenEXR standard (where it actually predates its introduction into the IEEE 754 standard).<sup id="cite_ref-OpenEXR_32-0" class="reference"><a href="#cite_note-OpenEXR-32"><span class="cite-bracket">[</span>25<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-OpenEXR-half_33-0" class="reference"><a href="#cite_note-OpenEXR-half-33"><span class="cite-bracket">[</span>26<span class="cite-bracket">]</span></a></sup></li></ul> <p>Any integer with absolute value less than 2<sup>24</sup> can be exactly represented in the single-precision format, and any integer with absolute value less than 2<sup>53</sup> can be exactly represented in the double-precision format. Furthermore, a wide range of powers of 2 times such a number can be represented. These properties are sometimes used for purely integer data, to get 53-bit integers on platforms that have double-precision floats but only 32-bit integers. </p><p>The standard specifies some special values, and their representation: positive <a href="/wiki/Infinity" title="Infinity">infinity</a> (<span class="texhtml">+∞</span>), negative infinity (<span class="texhtml">−∞</span>), a <a href="/wiki/Negative_zero" class="mw-redirect" title="Negative zero">negative zero</a> (−0) distinct from ordinary ("positive") zero, and "not a number" values (<a href="/wiki/NaN" title="NaN">NaNs</a>). </p><p>Comparison of floating-point numbers, as defined by the IEEE standard, is a bit different from usual integer comparison. Negative and positive zero compare equal, and every NaN compares unequal to every value, including itself. All finite floating-point numbers are strictly smaller than <span class="texhtml">+∞</span> and strictly greater than <span class="texhtml">−∞</span>, and they are ordered in the same way as their values (in the set of real numbers). 
</p> <div class="mw-heading mw-heading3"><h3 id="Internal_representation">Internal representation</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=7" title="Edit section: Internal representation"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Floating-point numbers are typically packed into a computer datum as the sign bit, the exponent field, and the significand or mantissa, from left to right. For the IEEE 754 binary formats (basic and extended) which have extant hardware implementations, they are apportioned as follows: </p> <table class="wikitable" style="text-align:right; border:0"> <tbody><tr> <th rowspan="2">Type </th> <th colspan="4">Bits </th> <td rowspan="7" style="background:white; border:0"> </td> <th rowspan="2">Exponent<br />bias </th> <th rowspan="2">Bits<br />precision </th> <th rowspan="2">Number of<br />decimal digits </th></tr> <tr> <th>Sign </th> <th>Exponent </th> <th>Significand </th> <th>Total </th></tr> <tr> <td><a href="/wiki/Half_precision" class="mw-redirect" title="Half precision">Half</a> (<a href="/wiki/IEEE_floating_point" class="mw-redirect" title="IEEE floating point">IEEE 754-2008</a>) </td> <td>1 </td> <td>5 </td> <td>10 </td> <td>16 </td> <td>15 </td> <td>11 </td> <td>~3.3 </td></tr> <tr> <td><a href="/wiki/Single_precision" class="mw-redirect" title="Single precision">Single</a> </td> <td>1 </td> <td>8 </td> <td>23 </td> <td>32 </td> <td>127 </td> <td>24 </td> <td>~7.2 </td></tr> <tr> <td><a href="/wiki/Double_precision" class="mw-redirect" title="Double precision">Double</a> </td> <td>1 </td> <td>11 </td> <td>52 </td> <td>64 </td> <td>1023 </td> <td>53 </td> <td>~15.9 </td></tr> <tr> <td><a href="/wiki/Extended_precision#x86_extended-precision_format" title="Extended precision">x86 extended precision</a> </td> <td>1 </td> <td>15 </td> <td>64 </td> <td>80 </td> <td>16383 </td> <td>64 </td> 
<td>~19.2 </td></tr> <tr> <td><a href="/wiki/Quad_precision" class="mw-redirect" title="Quad precision">Quad</a> </td> <td>1 </td> <td>15 </td> <td>112 </td> <td>128 </td> <td>16383 </td> <td>113 </td> <td>~34.0 </td></tr></tbody></table> <p>While the exponent can be positive or negative, in binary formats it is stored as an unsigned number that has a fixed "bias" added to it. Values of all 0s in this field are reserved for the zeros and <a href="/wiki/Subnormal_numbers" class="mw-redirect" title="Subnormal numbers">subnormal numbers</a>; values of all 1s are reserved for the infinities and NaNs. The exponent range for normal numbers is [−126, 127] for single precision, [−1022, 1023] for double, or [−16382, 16383] for quad. Normal numbers exclude subnormal values, zeros, infinities, and NaNs. </p><p>In the IEEE binary interchange formats the leading 1 bit of a normalized significand is not actually stored in the computer datum. It is called the "hidden" or "implicit" bit. Because of this, the single-precision format actually has a significand with 24 bits of precision, the double-precision format has 53, and quad has 113. 
</p><p>For example, it was shown above that π, rounded to 24 bits of precision, has: </p> <ul><li>sign = 0 ; <i>e</i> = 1 ; <i>s</i> = 110010010000111111011011 (including the hidden bit)</li></ul> <p>The sum of the exponent bias (127) and the exponent (1) is 128, so this is represented in the single-precision format as </p> <ul><li>0 10000000 10010010000111111011011 (excluding the hidden bit) = 40490FDB<sup id="cite_ref-IEEE-754_Analysis_34-0" class="reference"><a href="#cite_note-IEEE-754_Analysis-34"><span class="cite-bracket">[</span>27<span class="cite-bracket">]</span></a></sup> as a <a href="/wiki/Hexadecimal" title="Hexadecimal">hexadecimal</a> number.</li></ul> <p>An example of a layout for <a href="/wiki/Single-precision_floating-point_format" title="Single-precision floating-point format">32-bit floating point</a> is </p> <figure class="mw-default-size mw-halign-none" typeof="mw:File"><a href="/wiki/File:Float_example.svg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/d/d2/Float_example.svg/590px-Float_example.svg.png" decoding="async" width="590" height="75" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/d/d2/Float_example.svg/885px-Float_example.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/d/d2/Float_example.svg/1180px-Float_example.svg.png 2x" data-file-width="590" data-file-height="75" /></a><figcaption></figcaption></figure> <p>and the <a href="/wiki/Double-precision_floating-point_format" title="Double-precision floating-point format">64-bit ("double")</a> layout is similar. 
</p> <div class="mw-heading mw-heading2"><h2 id="Other_notable_floating-point_formats">Other notable floating-point formats</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=8" title="Edit section: Other notable floating-point formats"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>In addition to the widely used <a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a> standard formats, other floating-point formats are used, or have been used, in certain domain-specific areas. </p> <ul><li>The <a href="/wiki/Microsoft_Binary_Format" title="Microsoft Binary Format">Microsoft Binary Format (MBF)</a> was developed for the Microsoft BASIC language products, including Microsoft's first ever product the <a href="/wiki/Altair_BASIC" title="Altair BASIC">Altair BASIC</a> (1975), <a href="/wiki/TRS-80" title="TRS-80">TRS-80 LEVEL II</a>, <a href="/wiki/CP/M" title="CP/M">CP/M</a>'s <a href="/wiki/MBASIC" title="MBASIC">MBASIC</a>, <a href="/wiki/IBM_PC_5150" class="mw-redirect" title="IBM PC 5150">IBM PC 5150</a>'s <a href="/wiki/BASICA" class="mw-redirect" title="BASICA">BASICA</a>, <a href="/wiki/MS-DOS" title="MS-DOS">MS-DOS</a>'s <a href="/wiki/GW-BASIC" title="GW-BASIC">GW-BASIC</a> and <a href="/wiki/QuickBASIC" title="QuickBASIC">QuickBASIC</a> prior to version 4.00. QuickBASIC version 4.00 and 4.50 switched to the IEEE 754-1985 format but can revert to the MBF format using the /MBF command option. MBF was designed and developed on a simulated <a href="/wiki/Intel_8080" title="Intel 8080">Intel 8080</a> by <a href="/wiki/Monte_Davidoff" title="Monte Davidoff">Monte Davidoff</a>, a dormmate of <a href="/wiki/Bill_Gates" title="Bill Gates">Bill Gates</a>, during spring of 1975 for the <a href="/wiki/MITS_Altair_8800" class="mw-redirect" title="MITS Altair 8800">MITS Altair 8800</a>. 
The initial release of July 1975 supported a single-precision (32 bits) format due to the cost of the <a href="/wiki/MITS_Altair_8800" class="mw-redirect" title="MITS Altair 8800">MITS Altair 8800</a>'s 4-kilobyte memory. In December 1975, the 8-kilobyte version added a double-precision (64 bits) format. A single-precision (40 bits) variant format was adopted for other CPUs, notably the <a href="/wiki/MOS_6502" class="mw-redirect" title="MOS 6502">MOS 6502</a> (<a href="/wiki/Apple_//" class="mw-redirect" title="Apple //">Apple //</a>, <a href="/wiki/Commodore_PET" title="Commodore PET">Commodore PET</a>, <a href="/wiki/Atari" title="Atari">Atari</a>), <a href="/wiki/Motorola_6800" title="Motorola 6800">Motorola 6800</a> (MITS Altair 680) and <a href="/wiki/Motorola_6809" title="Motorola 6809">Motorola 6809</a> (<a href="/wiki/TRS-80_Color_Computer" title="TRS-80 Color Computer">TRS-80 Color Computer</a>). All Microsoft language products from 1975 through 1987 used the <a href="/wiki/Microsoft_Binary_Format" title="Microsoft Binary Format">Microsoft Binary Format</a>; Microsoft then adopted the IEEE 754 standard format in all its products, starting in 1988 and continuing through their current releases. 
MBF consists of the MBF single-precision format (32 bits, "6-digit BASIC"),<sup id="cite_ref-Borland_1994_MBF_35-0" class="reference"><a href="#cite_note-Borland_1994_MBF-35"><span class="cite-bracket">[</span>28<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Steil_2008_6502_36-0" class="reference"><a href="#cite_note-Steil_2008_6502-36"><span class="cite-bracket">[</span>29<span class="cite-bracket">]</span></a></sup> the MBF extended-precision format (40 bits, "9-digit BASIC"),<sup id="cite_ref-Steil_2008_6502_36-1" class="reference"><a href="#cite_note-Steil_2008_6502-36"><span class="cite-bracket">[</span>29<span class="cite-bracket">]</span></a></sup> and the MBF double-precision format (64 bits);<sup id="cite_ref-Borland_1994_MBF_35-1" class="reference"><a href="#cite_note-Borland_1994_MBF-35"><span class="cite-bracket">[</span>28<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Microsoft_2006_KB35826_37-0" class="reference"><a href="#cite_note-Microsoft_2006_KB35826-37"><span class="cite-bracket">[</span>30<span class="cite-bracket">]</span></a></sup> each of them is represented with an 8-bit exponent, followed by a sign bit, followed by a significand of respectively 23, 31, and 55 bits.</li> <li>The <a href="/wiki/Bfloat16_floating-point_format" title="Bfloat16 floating-point format">Bfloat16 format</a> requires the same amount of memory (16 bits) as the <a href="/wiki/Half-precision_floating-point_format" title="Half-precision floating-point format">IEEE 754 half-precision format</a>, but allocates 8 bits to the exponent instead of 5, thus providing the same range as an <a href="/wiki/Single-precision_floating-point_format" title="Single-precision floating-point format">IEEE 754 single-precision</a> number. The tradeoff is a reduced precision, as the trailing significand field is reduced from 10 to 7 bits. 
This format is mainly used in the training of <a href="/wiki/Machine_learning" title="Machine learning">machine learning</a> models, where range is more valuable than precision. Many machine learning accelerators provide hardware support for this format.</li> <li>The TensorFloat-32<sup id="cite_ref-Kharya_2020_38-0" class="reference"><a href="#cite_note-Kharya_2020-38"><span class="cite-bracket">[</span>31<span class="cite-bracket">]</span></a></sup> format combines the 8 bits of exponent of the Bfloat16 with the 10 bits of trailing significand field of half-precision formats, resulting in a size of 19 bits. This format was introduced by <a href="/wiki/Nvidia" title="Nvidia">Nvidia</a>, which provides hardware support for it in the Tensor Cores of its <a href="/wiki/Graphics_processing_unit" title="Graphics processing unit">GPUs</a> based on the Nvidia Ampere architecture. The drawback of this format is its size, which is not a power of 2. However, according to Nvidia, this format should only be used internally by hardware to speed up computations, while inputs and outputs should be stored in the 32-bit single-precision IEEE 754 format.<sup id="cite_ref-Kharya_2020_38-1" class="reference"><a href="#cite_note-Kharya_2020-38"><span class="cite-bracket">[</span>31<span class="cite-bracket">]</span></a></sup></li> <li>The <a href="/wiki/Hopper_(microarchitecture)" title="Hopper (microarchitecture)">Hopper</a> architecture GPUs provide two FP8 formats: one with the same numerical range as half-precision (E5M2) and one with higher precision, but less range (E4M3).<sup id="cite_ref-NVIDIA_Hopper_39-0" class="reference"><a href="#cite_note-NVIDIA_Hopper-39"><span class="cite-bracket">[</span>32<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Micikevicius_2022_40-0" class="reference"><a href="#cite_note-Micikevicius_2022-40"><span class="cite-bracket">[</span>33<span class="cite-bracket">]</span></a></sup></li></ul> <table class="wikitable"> <caption>Bfloat16, 
TensorFloat-32, and the two FP8 formats, compared with IEEE 754 half-precision and single-precision formats </caption> <tbody><tr> <th>Type </th> <th>Sign </th> <th>Exponent </th> <th>Trailing significand field </th> <th>Total bits </th></tr> <tr> <td>FP8 (E4M3) </td> <td>1 </td> <td>4 </td> <td>3 </td> <td>8 </td></tr> <tr> <td>FP8 (E5M2) </td> <td>1 </td> <td>5 </td> <td>2 </td> <td>8 </td></tr> <tr> <td><a href="/wiki/Half-precision_floating-point_format" title="Half-precision floating-point format">Half-precision</a> </td> <td>1 </td> <td>5 </td> <td>10 </td> <td>16 </td></tr> <tr> <td><a href="/wiki/Bfloat16_floating-point_format" title="Bfloat16 floating-point format">Bfloat16</a> </td> <td>1 </td> <td>8 </td> <td>7 </td> <td>16 </td></tr> <tr> <td>TensorFloat-32 </td> <td>1 </td> <td>8 </td> <td>10 </td> <td>19 </td></tr> <tr> <td><a href="/wiki/Single-precision_floating-point_format" title="Single-precision floating-point format">Single-precision</a> </td> <td>1 </td> <td>8 </td> <td>23 </td> <td>32 </td></tr></tbody></table> <div class="mw-heading mw-heading2"><h2 id="Representable_numbers,_conversion_and_rounding"><span id="Representable_numbers.2C_conversion_and_rounding"></span>Representable numbers, conversion and rounding <span class="anchor" id="Representable_numbers"></span></h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=9" title="Edit section: Representable numbers, conversion and rounding"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>By their nature, all numbers expressed in floating-point format are <a href="/wiki/Rational_number" title="Rational number">rational numbers</a> with a terminating expansion in the relevant base (for example, a terminating decimal expansion in base-10, or a terminating binary expansion in base-2). 
Irrational numbers, such as <a href="/wiki/Pi" title="Pi">π</a> or √2, or non-terminating rational numbers, must be approximated. The number of digits (or bits) of precision also limits the set of rational numbers that can be represented exactly. For example, the decimal number 123456789 cannot be exactly represented if only eight decimal digits of precision are available (it would be rounded to one of the two straddling representable values, 12345678 × 10<sup>1</sup> or 12345679 × 10<sup>1</sup>); the same applies to <a href="/wiki/Repeating_decimal" title="Repeating decimal">non-terminating digits</a> (.<span style="text-decoration:overline;">5</span> to be rounded to either .55555555 or .55555556). </p><p>When a number is represented in some format (such as a character string) which is not a native floating-point representation supported in a computer implementation, then it will require a conversion before it can be used in that implementation. If the number can be represented exactly in the floating-point format then the conversion is exact. If there is not an exact representation then the conversion requires a choice of which floating-point number to use to represent the original value. The representation chosen will have a different value from the original, and the value thus adjusted is called the <i>rounded value</i>. </p><p>Whether or not a rational number has a terminating expansion depends on the base. For example, in base-10 the number 1/2 has a terminating expansion (0.5) while the number 1/3 does not (0.333...). In base-2 only rationals with denominators that are powers of 2 (such as 1/2 or 3/16) are terminating. Any rational with a denominator that has a prime factor other than 2 will have an infinite binary expansion. This means that numbers that appear to be short and exact when written in decimal format may need to be approximated when converted to binary floating-point. 
For example, the decimal number 0.1 is not representable in binary floating-point of any finite precision; the exact binary representation would have a "1100" sequence continuing endlessly: </p> <dl><dd><i>e</i> = −4; <i>s</i> = 1100110011001100110011001100110011...,</dd></dl> <p>where, as previously, <i>s</i> is the significand and <i>e</i> is the exponent. </p><p>When rounded to 24 bits this becomes </p> <dl><dd><i>e</i> = −4; <i>s</i> = 110011001100110011001101,</dd></dl> <p>which is actually 0.100000001490116119384765625 in decimal. </p><p>As a further example, the real number <a href="/wiki/Pi" title="Pi">π</a>, represented in binary as an infinite sequence of bits is </p> <dl><dd>11.0010010000111111011010101000100010000101101000110000100011010011...</dd></dl> <p>but is </p> <dl><dd>11.0010010000111111011011</dd></dl> <p>when approximated by <a href="/wiki/Rounding" title="Rounding">rounding</a> to a precision of 24 bits. </p><p>In binary single-precision floating-point, this is represented as <i>s</i> = 1.10010010000111111011011 with <i>e</i> = 1. This has a decimal value of </p> <dl><dd><b>3.141592</b>7410125732421875,</dd></dl> <p>whereas a more accurate approximation of the true value of π is </p> <dl><dd><b>3.14159265358979323846264338327950</b>...</dd></dl> <p>The result of rounding differs from the true value by about 0.03 parts per million, and matches the decimal representation of π in the first 7 digits. The difference is the <a href="/wiki/Discretization_error" title="Discretization error">discretization error</a> and is limited by the <a href="/wiki/Machine_epsilon" title="Machine epsilon">machine epsilon</a>. </p><p>The arithmetical difference between two consecutive representable floating-point numbers which have the same exponent is called a <a href="/wiki/Unit_in_the_last_place" title="Unit in the last place">unit in the last place</a> (ULP). 
For example, if there is no representable number lying between the representable numbers 1.45a70c22<sub>hex</sub> and 1.45a70c24<sub>hex</sub>, the ULP is 2×16<sup>−8</sup>, or 2<sup>−31</sup>. For numbers with a base-2 exponent part of 0, i.e. numbers with an absolute value higher than or equal to 1 but lower than 2, a ULP is exactly 2<sup>−23</sup> or about 10<sup>−7</sup> in single precision, and exactly 2<sup>−52</sup> or about 10<sup>−16</sup> in double precision. The mandated behavior of IEEE-compliant hardware is that the result be within one-half of a ULP. </p> <div class="mw-heading mw-heading3"><h3 id="Rounding_modes">Rounding modes</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=10" title="Edit section: Rounding modes"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Rounding is used when the exact result of a floating-point operation (or a conversion to floating-point format) would need more digits than there are digits in the significand. IEEE 754 requires <i>correct rounding</i>: that is, the rounded result is as if infinitely precise arithmetic was used to compute the value and then rounded (although in implementation only three extra bits are needed to ensure this). There are several different <a href="/wiki/Rounding" title="Rounding">rounding</a> schemes (or <i>rounding modes</i>). Historically, <a href="/wiki/Truncation" title="Truncation">truncation</a> was the typical approach. Since the introduction of IEEE 754, the default method (<i><a href="/wiki/Rounding" title="Rounding">round to nearest, ties to even</a></i>, sometimes called Banker's Rounding) is more commonly used. 
This method rounds the ideal (infinitely precise) result of an arithmetic operation to the nearest representable value, and gives that representation as the result.<sup id="cite_ref-NB_1_41-0" class="reference"><a href="#cite_note-NB_1-41"><span class="cite-bracket">[</span>nb 8<span class="cite-bracket">]</span></a></sup> In the case of a tie, the value that would make the significand end in an even digit is chosen. The IEEE 754 standard requires the same rounding to be applied to all fundamental algebraic operations, including square root and conversions, when there is a numeric (non-NaN) result. It means that the results of IEEE 754 operations are completely determined in all bits of the result, except for the representation of NaNs. ("Library" functions such as cosine and log are not mandated.) </p><p>Alternative rounding options are also available. IEEE 754 specifies the following rounding modes: </p> <ul><li>round to nearest, where ties round to the nearest even digit in the required position (the default and by far the most common mode)</li> <li>round to nearest, where ties round away from zero (optional for binary floating-point and commonly used in decimal)</li> <li>round up (toward +∞; negative results thus round toward zero)</li> <li>round down (toward −∞; negative results thus round away from zero)</li> <li>round toward zero (truncation; it is similar to the common behavior of float-to-integer conversions, which convert −3.9 to −3 and 3.9 to 3)</li></ul> <p>Alternative modes are useful when the amount of error being introduced must be bounded. Applications that require a bounded error are multi-precision floating-point, and <a href="/wiki/Interval_arithmetic" title="Interval arithmetic">interval arithmetic</a>. 
The alternative rounding modes are also useful in diagnosing numerical instability: if the results of a subroutine vary substantially between rounding to + and − infinity then it is likely numerically unstable and affected by round-off error.<sup id="cite_ref-Kahan_2006_Mindless_42-0" class="reference"><a href="#cite_note-Kahan_2006_Mindless-42"><span class="cite-bracket">[</span>34<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Binary-to-decimal_conversion_with_minimal_number_of_digits">Binary-to-decimal conversion with minimal number of digits</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=11" title="Edit section: Binary-to-decimal conversion with minimal number of digits"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Converting a double-precision binary floating-point number to a decimal string is a common operation, but an algorithm producing results that are both accurate and minimal did not appear in print until 1990, with Steele and White's Dragon4. Some of the improvements since then include: </p> <ul><li>David M. Gay's <i>dtoa.c</i>, a practical open-source implementation of many ideas in Dragon4.<sup id="cite_ref-Gay_1990_43-0" class="reference"><a href="#cite_note-Gay_1990-43"><span class="cite-bracket">[</span>35<span class="cite-bracket">]</span></a></sup></li> <li>Grisu3, with a 4× speedup as it removes the use of <a href="/wiki/Bignum" class="mw-redirect" title="Bignum">bignums</a>. Must be used with a fallback, as it fails for ~0.5% of cases.<sup id="cite_ref-Loitsch_2010_44-0" class="reference"><a href="#cite_note-Loitsch_2010-44"><span class="cite-bracket">[</span>36<span class="cite-bracket">]</span></a></sup></li> <li>Errol3, an always-succeeding algorithm similar to, but slower than, Grisu3. 
Apparently not as good as an early-terminating Grisu with fallback.<sup id="cite_ref-mazong_45-0" class="reference"><a href="#cite_note-mazong-45"><span class="cite-bracket">[</span>37<span class="cite-bracket">]</span></a></sup></li> <li>Ryū, an always-succeeding algorithm that is faster and simpler than Grisu3.<sup id="cite_ref-Adams_2018_46-0" class="reference"><a href="#cite_note-Adams_2018-46"><span class="cite-bracket">[</span>38<span class="cite-bracket">]</span></a></sup></li> <li>Schubfach, an always-succeeding algorithm that is based on a similar idea to Ryū, developed almost simultaneously and independently.<sup id="cite_ref-Giulietti_47-0" class="reference"><a href="#cite_note-Giulietti-47"><span class="cite-bracket">[</span>39<span class="cite-bracket">]</span></a></sup> Performs better than Ryū and Grisu3 in certain benchmarks.<sup id="cite_ref-abolz_48-0" class="reference"><a href="#cite_note-abolz-48"><span class="cite-bracket">[</span>40<span class="cite-bracket">]</span></a></sup></li></ul> <p>Many modern language runtimes use Grisu3 with a Dragon4 fallback.<sup id="cite_ref-double_conversion_2020_49-0" class="reference"><a href="#cite_note-double_conversion_2020-49"><span class="cite-bracket">[</span>41<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Decimal-to-binary_conversion">Decimal-to-binary conversion</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=12" title="Edit section: Decimal-to-binary conversion"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The problem of parsing a decimal string into a binary FP representation is complex, with an accurate parser not appearing until Clinger's 1990 work (implemented in dtoa.c).<sup id="cite_ref-Gay_1990_43-1" class="reference"><a href="#cite_note-Gay_1990-43"><span class="cite-bracket">[</span>35<span 
class="cite-bracket">]</span></a></sup> Further work has likewise progressed in the direction of faster parsing.<sup id="cite_ref-Lemire_2021_50-0" class="reference"><a href="#cite_note-Lemire_2021-50"><span class="cite-bracket">[</span>42<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Floating-point_operations">Floating-point operations</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=13" title="Edit section: Floating-point operations"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>For ease of presentation and understanding, decimal <a href="/wiki/Radix" title="Radix">radix</a> with 7 digit precision will be used in the examples, as in the IEEE 754 <i>decimal32</i> format. The fundamental principles are the same in any <a href="/wiki/Radix" title="Radix">radix</a> or precision, except that normalization is optional (it does not affect the numerical value of the result). Here, <i>s</i> denotes the significand and <i>e</i> denotes the exponent. </p> <div class="mw-heading mw-heading3"><h3 id="Addition_and_subtraction">Addition and subtraction</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=14" title="Edit section: Addition and subtraction"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A simple method to add floating-point numbers is to first represent them with the same exponent. 
In the example below, the second number (with the smaller exponent) is shifted right by three digits, and one then proceeds with the usual addition method: </p> <pre> 123456.7 = 1.234567 × 10^5 101.7654 = 1.017654 × 10^2 = 0.001017654 × 10^5 </pre> <pre> Hence: 123456.7 + 101.7654 = (1.234567 × 10^5) + (1.017654 × 10^2) = (1.234567 × 10^5) + (0.001017654 × 10^5) = (1.234567 + 0.001017654) × 10^5 = 1.235584654 × 10^5 </pre> <p>In detail: </p> <pre> e=5; s=1.234567 (123456.7) + e=2; s=1.017654 (101.7654) </pre> <pre> e=5; s=1.234567 + e=5; s=0.001017654 (after shifting) -------------------- e=5; s=1.235584654 (true sum: 123558.4654) </pre> <p>This is the true result, the exact sum of the operands. It will be rounded to seven digits and then normalized if necessary. The final result is </p> <pre> e=5; s=1.235585 (final sum: 123558.5) </pre> <p>The lowest three digits of the second operand (654) are essentially lost. This is <a href="/wiki/Round-off_error" title="Round-off error">round-off error</a>. 
In extreme cases, the sum of two non-zero numbers may be equal to one of them: </p> <pre> e=5; s=1.234567 + e=−3; s=9.876543 </pre> <pre> e=5; s=1.234567 + e=5; s=0.00000009876543 (after shifting) ---------------------- e=5; s=1.23456709876543 (true sum) e=5; s=1.234567 (after rounding and normalization) </pre> <p>In the above conceptual examples it would appear that a large number of extra digits would need to be provided by the adder to ensure correct rounding; however, for binary addition or subtraction using careful implementation techniques only a <i>guard</i> bit, a <i>rounding</i> bit and one extra <i>sticky</i> bit need to be carried beyond the precision of the operands.<sup id="cite_ref-Goldberg_1991_51-0" class="reference"><a href="#cite_note-Goldberg_1991-51"><span class="cite-bracket">[</span>43<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Patterson-Hennessy_2014_52-0" class="reference"><a href="#cite_note-Patterson-Hennessy_2014-52"><span class="cite-bracket">[</span>44<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page / location: 218–220">: 218–220 </span></sup> </p><p>Another problem of loss of significance occurs when <i>approximations</i> to two nearly equal numbers are subtracted. In the following example <i>e</i> = 5; <i>s</i> = 1.234571 and <i>e</i> = 5; <i>s</i> = 1.234567 are approximations to the rationals 123457.1467 and 123456.659. </p> <pre> e=5; s=1.234571 − e=5; s=1.234567 ---------------- e=5; s=0.000004 e=−1; s=4.000000 (after rounding and normalization) </pre> <p>The floating-point difference is computed exactly because the numbers are close—the <a href="/wiki/Sterbenz_lemma" title="Sterbenz lemma">Sterbenz lemma</a> guarantees this, even in case of underflow when <a href="/wiki/Gradual_underflow" class="mw-redirect" title="Gradual underflow">gradual underflow</a> is supported. 
Despite this, the difference of the original numbers is <i>e</i> = −1; <i>s</i> = 4.877000, which differs by more than 20% from the difference <i>e</i> = −1; <i>s</i> = 4.000000 of the approximations. In extreme cases, all significant digits of precision can be lost.<sup id="cite_ref-Goldberg_1991_51-1" class="reference"><a href="#cite_note-Goldberg_1991-51"><span class="cite-bracket">[</span>43<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Sierra_1962_53-0" class="reference"><a href="#cite_note-Sierra_1962-53"><span class="cite-bracket">[</span>45<span class="cite-bracket">]</span></a></sup> This <i><a href="/wiki/Catastrophic_cancellation" title="Catastrophic cancellation">cancellation</a></i> illustrates the danger in assuming that all of the digits of a computed result are meaningful. Dealing with the consequences of these errors is a topic in <a href="/wiki/Numerical_analysis" title="Numerical analysis">numerical analysis</a>; see also <a href="#Accuracy_problems">Accuracy problems</a>. </p> <div class="mw-heading mw-heading3"><h3 id="Multiplication_and_division">Multiplication and division</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=15" title="Edit section: Multiplication and division"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>To multiply, the significands are multiplied while the exponents are added, and the result is rounded and normalized. </p> <pre> e=3; s=4.734612 × e=5; s=5.417242 ----------------------- e=8; s=25.648538980104 (true product) e=8; s=25.64854 (after rounding) e=9; s=2.564854 (after normalization) </pre> <p>Similarly, division is accomplished by subtracting the divisor's exponent from the dividend's exponent, and dividing the dividend's significand by the divisor's significand. 
</p><p>There are no cancellation or absorption problems with multiplication or division, though small errors may accumulate as operations are performed in succession.<sup id="cite_ref-Goldberg_1991_51-2" class="reference"><a href="#cite_note-Goldberg_1991-51"><span class="cite-bracket">[</span>43<span class="cite-bracket">]</span></a></sup> In practice, the way these operations are carried out in digital logic can be quite complex (see <a href="/wiki/Booth%27s_multiplication_algorithm" title="Booth's multiplication algorithm">Booth's multiplication algorithm</a> and <a href="/wiki/Division_algorithm" title="Division algorithm">Division algorithm</a>).<sup id="cite_ref-NB_2_54-0" class="reference"><a href="#cite_note-NB_2-54"><span class="cite-bracket">[</span>nb 9<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Literal_syntax">Literal syntax</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=16" title="Edit section: Literal syntax"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Literals for floating-point numbers depend on languages. They typically use <code>e</code> or <code>E</code> to denote <a href="/wiki/Scientific_notation" title="Scientific notation">scientific notation</a>. The <a href="/wiki/C_(programming_language)" title="C (programming language)">C programming language</a> and the <a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a> standard also define a <a href="/wiki/IEEE_754#Hexadecimal_literals" title="IEEE 754">hexadecimal literal syntax</a> with a base-2 exponent instead of 10. In languages like <a href="/wiki/C_(programming_language)" title="C (programming language)">C</a>, when the decimal exponent is omitted, a decimal point is needed to differentiate them from integers. 
Other languages do not have an integer type (such as <a href="/wiki/JavaScript" title="JavaScript">JavaScript</a>), or allow overloading of numeric types (such as <a href="/wiki/Haskell_(programming_language)" class="mw-redirect" title="Haskell (programming language)">Haskell</a>). In these cases, digit strings such as <code>123</code> may also be floating-point literals. </p><p>Examples of floating-point literals are: </p> <ul><li><code>99.9</code></li> <li><code>-5000.12</code></li> <li><code>6.02e23</code></li> <li><code>-3e-45</code></li> <li><code>0x1.fffffep+127</code> in C and IEEE 754</li></ul> <div class="mw-heading mw-heading2"><h2 id="Dealing_with_exceptional_cases">Dealing with exceptional cases <span class="anchor" id="Floating_point_exception"></span><span class="anchor" id="Exception_handling"></span></h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=17" title="Edit section: Dealing with exceptional cases"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951" /><div role="note" class="hatnote navigation-not-searchable">Further information: <a href="/wiki/IEEE_754#Exception_handling" title="IEEE 754">IEEE 754 § Exception handling</a></div> <p>Floating-point computation in a computer can run into three kinds of problems: </p> <ul><li>An operation can be mathematically undefined, such as ∞/∞, or <a href="/wiki/Division_by_zero" title="Division by zero">division by zero</a>.</li> <li>An operation can be legal in principle, but not supported by the specific format, for example, calculating the <a href="/wiki/Square_root" title="Square root">square root</a> of −1 or the inverse sine of 2 (both of which result in <a href="/wiki/Complex_number" title="Complex number">complex numbers</a>).</li> <li>An operation can be legal in principle, but the result 
can be impossible to represent in the specified format, because the exponent is too large or too small to encode in the exponent field. Such an event is called an overflow (exponent too large), <a href="/wiki/Arithmetic_underflow" title="Arithmetic underflow">underflow</a> (exponent too small) or <a href="/wiki/Subnormal_number" title="Subnormal number">denormalization</a> (precision loss).</li></ul> <p>Prior to the IEEE standard, such conditions usually caused the program to terminate, or triggered some kind of <a href="/wiki/Trap_(computing)" class="mw-redirect" title="Trap (computing)">trap</a> that the programmer might be able to catch. How this worked was system-dependent, meaning that floating-point programs were not <a href="/wiki/Porting" title="Porting">portable</a>. (The term "exception" as used in IEEE 754 is a general term meaning an exceptional condition, which is not necessarily an error, and is a different usage from that typically defined in programming languages such as C++ or Java, in which an "<a href="/wiki/Exception_handling" title="Exception handling">exception</a>" is an alternative flow of control, closer to what is termed a "trap" in IEEE 754 terminology.) </p><p>Here, the required default method of handling exceptions according to IEEE 754 is discussed (the IEEE 754 optional trapping and other "alternate exception handling" modes are not discussed). Arithmetic exceptions are (by default) required to be recorded in "sticky" status flag bits. That they are "sticky" means that they are not reset by the next (arithmetic) operation, but stay set until explicitly reset. The use of "sticky" flags thus allows for testing of exceptional conditions to be delayed until after a full floating-point expression or subroutine: without them, exceptional conditions that could not be otherwise ignored would require explicit testing immediately after every floating-point operation. 
By default, an operation always returns a result according to specification without interrupting computation. For instance, 1/0 returns +∞, while also setting the divide-by-zero flag bit (this default of ∞ is designed to often return a finite result when used in subsequent operations and so be safely ignored). </p><p>The original IEEE 754 standard, however, failed to recommend operations to handle such sets of arithmetic exception flag bits. So while these were implemented in hardware, initially programming language implementations typically did not provide a means to access them (apart from assembler). Over time some programming language standards (e.g., <a href="/wiki/C99" title="C99">C99</a>/C11 and Fortran) have been updated to specify methods to access and change status flag bits. The 2008 version of the IEEE 754 standard now specifies a few operations for accessing and handling the arithmetic flag bits. The programming model is based on a single thread of execution and use of them by multiple threads has to be handled by a <a href="/wiki/Concurrency_(computer_science)" title="Concurrency (computer science)">means</a> outside of the standard (e.g. <a href="/wiki/C11_(C_standard_revision)" title="C11 (C standard revision)">C11</a> specifies that the flags have <a href="/wiki/Thread-local_storage" title="Thread-local storage">thread-local storage</a>). </p><p>IEEE 754 specifies five arithmetic exceptions that are to be recorded in the status flags ("sticky bits"): </p> <ul><li><b>inexact</b>, set if the rounded (and returned) value is different from the mathematically exact result of the operation.</li> <li><b>underflow</b>, set if the rounded value is tiny (as specified in IEEE 754) <i>and</i> inexact (or maybe limited to if it has denormalization loss, as per the 1985 version of IEEE 754), returning a subnormal value including the zeros.</li> <li><b>overflow</b>, set if the absolute value of the rounded value is too large to be represented. 
An infinity or maximal finite value is returned, depending on which rounding is used.</li> <li><b>divide-by-zero</b>, set if the result is infinite given finite operands, returning an infinity, either +∞ or −∞.</li> <li><b>invalid</b>, set if a finite or infinite result cannot be returned e.g. sqrt(−1) or 0/0, returning a quiet NaN.</li></ul> <figure class="mw-halign-left" typeof="mw:File/Thumb"><a href="/wiki/File:Resistors_in_Parallel.svg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/6/64/Resistors_in_Parallel.svg/200px-Resistors_in_Parallel.svg.png" decoding="async" width="200" height="80" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/64/Resistors_in_Parallel.svg/300px-Resistors_in_Parallel.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/64/Resistors_in_Parallel.svg/400px-Resistors_in_Parallel.svg.png 2x" data-file-width="300" data-file-height="120" /></a><figcaption>Fig. 1: resistances in parallel, with total resistance <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle R_{tot}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mi>o</mi> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle R_{tot}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/5db9ad2846f85f109dcfaaa60106f69304ff2377" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:3.981ex; height:2.509ex;" alt="{\displaystyle R_{tot}}" /></span></figcaption></figure><p>The default return value for each of the exceptions is designed to give the correct result in the majority of cases such that the exceptions can be 
ignored in the majority of codes. <i>inexact</i> returns a correctly rounded result, and <i>underflow</i> returns a value less than or equal to the smallest positive normal number in magnitude and can almost always be ignored.<sup id="cite_ref-Kahan_1997_Status_55-0" class="reference"><a href="#cite_note-Kahan_1997_Status-55"><span class="cite-bracket">[</span>46<span class="cite-bracket">]</span></a></sup> <i>divide-by-zero</i> returns infinity exactly, which will typically then divide a finite number and so give zero, or else will give an <i>invalid</i> exception subsequently if not, and so can also typically be ignored. For example, the effective resistance of n resistors in parallel (see fig. 1) is given by <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle R_{\text{tot}}=1/(1/R_{1}+1/R_{2}+\cdots +1/R_{n})}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mtext>tot</mtext> </mrow> </msub> <mo>=</mo> <mn>1</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mo stretchy="false">(</mo> <mn>1</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>+</mo> <mn>1</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>+</mo> <mo>⋯<!-- ⋯ --></mo> <mo>+</mo> <mn>1</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </msub> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle R_{\text{tot}}=1/(1/R_{1}+1/R_{2}+\cdots +1/R_{n})}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/cd57a135e4c4097f3aac012da434c472b1fbe90b" 
class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:38.168ex; height:2.843ex;" alt="{\displaystyle R_{\text{tot}}=1/(1/R_{1}+1/R_{2}+\cdots +1/R_{n})}" /></span>. If a short-circuit develops with <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle R_{1}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle R_{1}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c1d63c96f59d98589d923c4f0b04222feaa7283e" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:2.818ex; height:2.509ex;" alt="{\displaystyle R_{1}}" /></span> set to 0, <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 1/R_{1}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>1</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 1/R_{1}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/e6e4aaeaef2ceb43e4b429f07d7bfd798d3db831" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:5.143ex; height:2.843ex;" alt="{\displaystyle 1/R_{1}}" /></span> will return +infinity which will give a final <span class="mwe-math-element"><span class="mwe-math-mathml-inline 
mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle R_{tot}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>R</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>t</mi> <mi>o</mi> <mi>t</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle R_{tot}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/5db9ad2846f85f109dcfaaa60106f69304ff2377" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:3.981ex; height:2.509ex;" alt="{\displaystyle R_{tot}}" /></span> of 0, as expected<sup id="cite_ref-Intel_56-0" class="reference"><a href="#cite_note-Intel-56"><span class="cite-bracket">[</span>47<span class="cite-bracket">]</span></a></sup> (see the continued fraction example of <a href="/wiki/Floating_point#IEEE_754:_floating_point_in_modern_computers" class="mw-redirect" title="Floating point">IEEE 754 design rationale</a> for another example). 
</p><p><i>Overflow</i> and <i>invalid</i> exceptions can typically not be ignored, but do not necessarily represent errors: for example, a <a href="/wiki/Zero_of_a_function" title="Zero of a function">root-finding</a> routine, as part of its normal operation, may evaluate a passed-in function at values outside of its domain, returning NaN and an <i>invalid</i> exception flag to be ignored until finding a useful start point.<sup id="cite_ref-Kahan_1997_Status_55-1" class="reference"><a href="#cite_note-Kahan_1997_Status-55"><span class="cite-bracket">[</span>46<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Accuracy_problems">Accuracy problems</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=18" title="Edit section: Accuracy problems"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The fact that floating-point numbers cannot accurately represent all real numbers, and that floating-point operations cannot accurately represent true arithmetic operations, leads to many surprising situations. This is related to the finite <a href="/wiki/Precision_(computer_science)" title="Precision (computer science)">precision</a> with which computers generally represent numbers. </p><p>For example, the decimal numbers 0.1 and 0.01 cannot be represented exactly as binary floating-point numbers. In the IEEE 754 binary32 format with its 24-bit significand, the result of attempting to square the approximation to 0.1 is neither 0.01 nor the representable number closest to it. 
The decimal number 0.1 is represented in binary as <span class="texhtml"><var style="padding-right: 1px;">e</var> = −4</span>; <span class="texhtml"><var style="padding-right: 1px;">s</var> = 110011001100110011001101</span>, which is </p> <style data-mw-deduplicate="TemplateStyles:r996643573">.mw-parser-output .block-indent{padding-left:3em;padding-right:0;overflow:hidden}</style><div class="block-indent">0.100000001490116119384765625 exactly.</div> <p>Squaring this number gives </p> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r996643573" /><div class="block-indent">0.010000000298023226097399174250313080847263336181640625 exactly.</div> <p>Squaring it with rounding to the 24-bit precision gives </p> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r996643573" /><div class="block-indent">0.010000000707805156707763671875 exactly.</div> <p>But the representable number closest to 0.01 is </p> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r996643573" /><div class="block-indent">0.009999999776482582092285156250 exactly.</div> <p>Also, the non-representability of π (and π/2) means that an attempted computation of tan(π/2) will not yield a result of infinity, nor will it even overflow in the usual floating-point formats (assuming an accurate implementation of tan). It is simply not possible for standard floating-point hardware to attempt to compute tan(π/2), because π/2 cannot be represented exactly. This computation in C: </p> <div class="mw-highlight mw-highlight-lang-c mw-content-ltr" dir="ltr"><pre><span></span><span class="cm">/* Enough digits to be sure we get the correct approximation. 
*/</span> <span class="kt">double</span><span class="w"> </span><span class="n">pi</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mf">3.1415926535897932384626433832795</span><span class="p">;</span> <span class="kt">double</span><span class="w"> </span><span class="n">z</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">tan</span><span class="p">(</span><span class="n">pi</span><span class="o">/</span><span class="mf">2.0</span><span class="p">);</span> </pre></div> <p>will give a result of 16331239353195370.0. In single precision (using the <code>tanf</code> function), the result will be −22877332.0. </p><p>By the same token, an attempted computation of sin(π) will not yield zero. The result will be (approximately) 0.1225<span style="margin:0 .15em 0 .25em">×</span>10<sup>−15</sup> in double precision, or −0.8742<span style="margin:0 .15em 0 .25em">×</span>10<sup>−7</sup> in single precision.<sup id="cite_ref-NB_3_57-0" class="reference"><a href="#cite_note-NB_3-57"><span class="cite-bracket">[</span>nb 10<span class="cite-bracket">]</span></a></sup> </p><p>While floating-point addition and multiplication are both <a href="/wiki/Commutative" class="mw-redirect" title="Commutative">commutative</a> (<span class="texhtml"><var style="padding-right: 1px;">a</var> + <var style="padding-right: 1px;">b</var> = <var style="padding-right: 1px;">b</var> + <var style="padding-right: 1px;">a</var></span> and <span class="texhtml"><var style="padding-right: 1px;">a</var> × <var style="padding-right: 1px;">b</var> = <var style="padding-right: 1px;">b</var> × <var style="padding-right: 1px;">a</var></span>), they are not necessarily <a href="/wiki/Associative_property" title="Associative property">associative</a>. 
That is, <span class="texhtml">(<var style="padding-right: 1px;">a</var> + <var style="padding-right: 1px;">b</var>) + <var style="padding-right: 1px;">c</var></span> is not necessarily equal to <span class="texhtml"><var style="padding-right: 1px;">a</var> + (<var style="padding-right: 1px;">b</var> + <var style="padding-right: 1px;">c</var>)</span>. Using 7-digit significand decimal arithmetic: </p>
<pre>
 a = 1234.567, b = 45.67834, c = 0.0004
</pre>
<pre>
 (a + b) + c:
     1234.567   (a)
   +   45.67834 (b)
   ____________
     1280.24534   rounds to   1280.245
</pre>
<pre>
     1280.245  (a + b)
   +    0.0004 (c)
   ____________
     1280.2454   rounds to   <b>1280.245</b>  ← (a + b) + c
</pre>
<pre>
 a + (b + c):
     45.67834 (b)
   +  0.0004  (c)
   ____________
     45.67874
</pre>
<pre>
     1234.567   (a)
   +   45.67874 (b + c)
   ____________
     1280.24574   rounds to   <b>1280.246</b>  ← a + (b + c)
</pre>
<p>They are also not necessarily <a href="/wiki/Distributive_property" title="Distributive property">distributive</a>. That is, <span class="texhtml">(<var style="padding-right: 1px;">a</var> + <var style="padding-right: 1px;">b</var>) × <var style="padding-right: 1px;">c</var></span> may not be the same as <span class="texhtml"><var style="padding-right: 1px;">a</var> × <var style="padding-right: 1px;">c</var> + <var style="padding-right: 1px;">b</var> × <var style="padding-right: 1px;">c</var></span>: </p>
<pre>
 1234.567 × 3.333333 = 4115.223
 1.234567 × 3.333333 = 4.115223
                       4115.223 + 4.115223 = 4119.338
 but
 1234.567 + 1.234567 = 1235.802
                       1235.802 × 3.333333 = 4119.340
</pre>
<p>In addition to loss of significance, inability to represent numbers such as π and 0.1 exactly, and other slight inaccuracies, the following phenomena may occur: </p> <div><ul><li><a href="/wiki/Catastrophic_cancellation" title="Catastrophic cancellation">Cancellation</a>: subtraction of nearly equal operands may cause extreme loss of accuracy.<sup id="cite_ref-Harris_58-0" class="reference"><a href="#cite_note-Harris-58"><span class="cite-bracket">[</span>48<span 
class="cite-bracket">]</span></a></sup><sup id="cite_ref-Sierra_1962_53-1" class="reference"><a href="#cite_note-Sierra_1962-53"><span class="cite-bracket">[</span>45<span class="cite-bracket">]</span></a></sup> When we subtract two almost equal numbers we set the most significant digits to zero, leaving ourselves with just the insignificant, and most erroneous, digits.<sup id="cite_ref-Muller_2010_1-5" class="reference"><a href="#cite_note-Muller_2010-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Page / location: 124">: 124 </span></sup> For example, when determining a <a href="/wiki/Derivative" title="Derivative">derivative</a> of a function the following formula is used: <p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle Q(h)={\frac {f(a+h)-f(a)}{h}}.}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>Q</mi> <mo stretchy="false">(</mo> <mi>h</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mi>f</mi> <mo stretchy="false">(</mo> <mi>a</mi> <mo>+</mo> <mi>h</mi> <mo stretchy="false">)</mo> <mo>−<!-- − --></mo> <mi>f</mi> <mo stretchy="false">(</mo> <mi>a</mi> <mo stretchy="false">)</mo> </mrow> <mi>h</mi> </mfrac> </mrow> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle Q(h)={\frac {f(a+h)-f(a)}{h}}.}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/bb3cb4002e059cfdbf6e76b86efc59519ee208d3" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.005ex; width:25.223ex; height:5.843ex;" alt="{\displaystyle Q(h)={\frac {f(a+h)-f(a)}{h}}.}" /></span> </p> Intuitively one would want an <span class="texhtml"><var 
style="padding-right: 1px;">h</var></span> very close to zero; however, when using floating-point operations, the smallest number will not give the best approximation of a derivative. As <span class="texhtml"><var style="padding-right: 1px;">h</var></span> grows smaller, the difference between <span class="texhtml"><var style="padding-right: 1px;">f</var>(<var style="padding-right: 1px;">a</var> + <var style="padding-right: 1px;">h</var>)</span> and <span class="texhtml"><var style="padding-right: 1px;">f</var>(<var style="padding-right: 1px;">a</var>)</span> grows smaller, cancelling out the most significant and least erroneous digits and making the most erroneous digits more important. As a result, the smallest possible value of <span class="texhtml"><var style="padding-right: 1px;">h</var></span> will give a more erroneous approximation of a derivative than a somewhat larger value. This is perhaps the most common and serious accuracy problem.</li><li>Conversions to integer are not intuitive: converting (63.0/9.0) to integer yields 7, but converting (0.63/0.09) may yield 6. This is because conversions generally truncate rather than round. <a href="/wiki/Floor_and_ceiling_functions" title="Floor and ceiling functions">Floor and ceiling functions</a> may produce answers which are off by one from the intuitively expected value.</li><li>Limited exponent range: results might overflow yielding infinity, or underflow yielding a <a href="/wiki/Subnormal_number" title="Subnormal number">subnormal number</a> or zero. In these cases precision will be lost.</li><li>Testing for <a href="/wiki/Division_by_zero#Computer_arithmetic" title="Division by zero">safe division</a> is problematic: Checking that the divisor is not zero does not guarantee that a division will not overflow.</li><li>Testing for equality is problematic. 
Two computational sequences that are mathematically equal may well produce different floating-point values.<sup id="cite_ref-Barker_59-0" class="reference"><a href="#cite_note-Barker-59"><span class="cite-bracket">[</span>49<span class="cite-bracket">]</span></a></sup></li></ul></div> <div class="mw-heading mw-heading3"><h3 id="Incidents">Incidents</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=19" title="Edit section: Incidents"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li>On 25 February 1991, a <a href="/wiki/Loss_of_significance" class="mw-redirect" title="Loss of significance">loss of significance</a> in a <a href="/wiki/MIM-104_Patriot" title="MIM-104 Patriot">MIM-104 Patriot</a> missile battery <a href="/wiki/MIM-104_Patriot#Failure_at_Dhahran" title="MIM-104 Patriot">prevented it from intercepting</a> an incoming <a href="/wiki/Al_Hussein_(missile)" class="mw-redirect" title="Al Hussein (missile)">Scud</a> missile in <a href="/wiki/Dhahran" title="Dhahran">Dhahran</a>, <a href="/wiki/Saudi_Arabia" title="Saudi Arabia">Saudi Arabia</a>, contributing to the death of 28 soldiers from the U.S. 
Army's <a href="/wiki/14th_Quartermaster_Detachment" title="14th Quartermaster Detachment">14th Quartermaster Detachment</a>.<sup id="cite_ref-GAO_report_IMTEC_92-26_60-0" class="reference"><a href="#cite_note-GAO_report_IMTEC_92-26-60"><span class="cite-bracket">[</span>50<span class="cite-bracket">]</span></a></sup> The error was actually introduced by a <a href="/wiki/Fixed-point_arithmetic" title="Fixed-point arithmetic">fixed-point</a> computation,<sup id="cite_ref-Skeel_61-0" class="reference"><a href="#cite_note-Skeel-61"><span class="cite-bracket">[</span>51<span class="cite-bracket">]</span></a></sup> but the underlying issue would have been the same with floating-point arithmetic.</li> <li><span class="cleanup-needed-content" style="padding-left:0.1em; padding-right:0.1em; color:var(--color-subtle, #54595d); border:1px solid var(--border-color-subtle, #c8ccd1);"><a href="/wiki/Salami_slicing_tactics#Financial_schemes" title="Salami slicing tactics">Salami slicing</a> is the practice of removing the 'invisible' part of a transaction into a separate account.</span><sup class="noprint Inline-Template" style="margin-left:0.1em; white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Please_clarify" title="Wikipedia:Please clarify"><span title="It is not clear how this is an incident (the section title may have to be modified to cover more than incidents) and how this is due to floating-point arithmetic (rather than number approximations in general). The term 'invisible' may also be misleading without following explanations. 
(November 2024)">clarification needed</span></a></i>]</sup></li></ul> <div class="mw-heading mw-heading3"><h3 id="Machine_precision_and_backward_error_analysis">Machine precision and backward error analysis</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=20" title="Edit section: Machine precision and backward error analysis"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p><i>Machine precision</i> is a quantity that characterizes the accuracy of a floating-point system, and is used in <a href="/wiki/Error_analysis_(mathematics)#Error_analysis_in_numerical_modeling" title="Error analysis (mathematics)">backward error analysis</a> of floating-point algorithms. It is also known as unit roundoff or <i><a href="/wiki/Machine_epsilon" title="Machine epsilon">machine epsilon</a></i>. Usually denoted <span class="texhtml"><var style="padding-right: 1px;">Ε</var><sub>mach</sub></span>, its value depends on the particular rounding being used. 
</p><p>With rounding to zero, <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {E} _{\text{mach}}=B^{1-P},\,}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">E</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>mach</mtext> </mrow> </msub> <mo>=</mo> <msup> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mo>−<!-- − --></mo> <mi>P</mi> </mrow> </msup> <mo>,</mo> <mspace width="thinmathspace"></mspace> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \mathrm {E} _{\text{mach}}=B^{1-P},\,}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/16192cc80058cfea0162debfd31dc11a422b61dd" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:15.113ex; height:3.009ex;" alt="{\displaystyle \mathrm {E} _{\text{mach}}=B^{1-P},\,}" /></span> whereas with rounding to nearest, <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \mathrm {E} _{\text{mach}}={\tfrac {1}{2}}B^{1-P},}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">E</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>mach</mtext> </mrow> </msub> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="false" scriptlevel="0"> <mfrac> <mn>1</mn> <mn>2</mn> </mfrac> </mstyle> </mrow> <msup> <mi>B</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> <mo>−<!-- − --></mo> <mi>P</mi> </mrow> </msup> <mo>,</mo> </mstyle> </mrow> <annotation 
encoding="application/x-tex">{\displaystyle \mathrm {E} _{\text{mach}}={\tfrac {1}{2}}B^{1-P},}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/5b55346e447384f9497ffbb42d68a91280ec9974" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.171ex; width:16.384ex; height:3.509ex;" alt="{\displaystyle \mathrm {E} _{\text{mach}}={\tfrac {1}{2}}B^{1-P},}" /></span> where <i>B</i> is the base of the system and <i>P</i> is the precision of the significand (in base <i>B</i>). </p><p>This is important since it bounds the <i><a href="/wiki/Relative_error" class="mw-redirect" title="Relative error">relative error</a></i> in representing any non-zero real number <span class="texhtml"><var style="padding-right: 1px;">x</var></span> within the normalized range of a floating-point system: <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \left|{\frac {\operatorname {fl} (x)-x}{x}}\right|\leq \mathrm {E} _{\text{mach}}.}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow> <mo>|</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mi>fl</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mi>x</mi> <mo stretchy="false">)</mo> <mo>−<!-- − --></mo> <mi>x</mi> </mrow> <mi>x</mi> </mfrac> </mrow> <mo>|</mo> </mrow> <mo>≤<!-- ≤ --></mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">E</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>mach</mtext> </mrow> </msub> <mo>.</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \left|{\frac {\operatorname {fl} (x)-x}{x}}\right|\leq \mathrm {E} _{\text{mach}}.}</annotation> </semantics> </math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/853988a2e76c33411e1dbb3579a37fa35acb927c" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.671ex; width:20.193ex; height:6.509ex;" alt="{\displaystyle \left|{\frac {\operatorname {fl} (x)-x}{x}}\right|\leq \mathrm {E} _{\text{mach}}.}" /></span> </p><p>Backward error analysis, the theory of which was developed and popularized by <a href="/wiki/James_H._Wilkinson" title="James H. Wilkinson">James H. Wilkinson</a>, can be used to establish that an algorithm implementing a numerical function is numerically stable.<sup id="cite_ref-RalstonReilly2003_62-0" class="reference"><a href="#cite_note-RalstonReilly2003-62"><span class="cite-bracket">[</span>52<span class="cite-bracket">]</span></a></sup> The basic approach is to show that although the calculated result, due to roundoff errors, will not be exactly correct, it is the exact solution to a nearby problem with slightly perturbed input data. If the perturbation required is small, on the order of the uncertainty in the input data, then the results are in some sense as accurate as the data "deserves". The algorithm is then defined as <i><a href="/wiki/Numerical_stability#Forward,_backward,_and_mixed_stability" title="Numerical stability">backward stable</a></i>. 
Stability is a measure of the sensitivity to rounding errors of a given numerical procedure; by contrast, the <a href="/wiki/Condition_number" title="Condition number">condition number</a> of a function for a given problem indicates the inherent sensitivity of the function to small perturbations in its input and is independent of the implementation used to solve the problem.<sup id="cite_ref-Einarsson_2005_63-0" class="reference"><a href="#cite_note-Einarsson_2005-63"><span class="cite-bracket">[</span>53<span class="cite-bracket">]</span></a></sup> </p><p>As a trivial example, consider a simple expression giving the inner product of (length two) vectors <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle x}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>x</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle x}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/87f9e315fd7e2ba406057a97300593c4802b53e4" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:1.33ex; height:1.676ex;" alt="{\displaystyle x}" /></span> and <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle y}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>y</mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle y}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b8a6208ec717213d4317e666f1ae872e00620a0d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; 
width:1.155ex; height:2.009ex;" alt="{\displaystyle y}" /></span>, then <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\begin{aligned}\operatorname {fl} (x\cdot y)&=\operatorname {fl} {\big (}\operatorname {fl} (x_{1}\cdot y_{1})+\operatorname {fl} (x_{2}\cdot y_{2}){\big )},&&{\text{ where }}\operatorname {fl} (){\text{ indicates correctly rounded floating-point arithmetic}}\\&=\operatorname {fl} {\big (}(x_{1}\cdot y_{1})(1+\delta _{1})+(x_{2}\cdot y_{2})(1+\delta _{2}){\big )},&&{\text{ where }}\delta _{n}\leq \mathrm {E} _{\text{mach}},{\text{ from above}}\\&={\big (}(x_{1}\cdot y_{1})(1+\delta _{1})+(x_{2}\cdot y_{2})(1+\delta _{2}){\big )}(1+\delta _{3})\\&=(x_{1}\cdot y_{1})(1+\delta _{1})(1+\delta _{3})+(x_{2}\cdot y_{2})(1+\delta _{2})(1+\delta _{3}),\end{aligned}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"> <mtr> <mtd> <mi>fl</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mi>x</mi> <mo>⋅<!-- ⋅ --></mo> <mi>y</mi> <mo stretchy="false">)</mo> </mtd> <mtd> <mi></mi> <mo>=</mo> <mi>fl</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mo maxsize="1.2em" minsize="1.2em">(</mo> </mrow> </mrow> <mi>fl</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>+</mo> <mi>fl</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>⋅<!-- ⋅ 
--></mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mo maxsize="1.2em" minsize="1.2em">)</mo> </mrow> </mrow> <mo>,</mo> </mtd> <mtd></mtd> <mtd> <mrow class="MJX-TeXAtom-ORD"> <mtext> where </mtext> </mrow> <mi>fl</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mo stretchy="false">)</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext> indicates correctly rounded floating-point arithmetic</mtext> </mrow> </mtd> </mtr> <mtr> <mtd></mtd> <mtd> <mi></mi> <mo>=</mo> <mi>fl</mi> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mo maxsize="1.2em" minsize="1.2em">(</mo> </mrow> </mrow> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>+</mo> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mo maxsize="1.2em" minsize="1.2em">)</mo> </mrow> </mrow> <mo>,</mo> </mtd> <mtd></mtd> <mtd> <mrow class="MJX-TeXAtom-ORD"> <mtext> where </mtext> </mrow> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </msub> <mo>≤<!-- ≤ --></mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">E</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>mach</mtext> </mrow> 
</msub> <mo>,</mo> <mrow class="MJX-TeXAtom-ORD"> <mtext> from above</mtext> </mrow> </mtd> </mtr> <mtr> <mtd></mtd> <mtd> <mi></mi> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mo maxsize="1.2em" minsize="1.2em">(</mo> </mrow> </mrow> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>+</mo> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mo maxsize="1.2em" minsize="1.2em">)</mo> </mrow> </mrow> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> </msub> <mo stretchy="false">)</mo> </mtd> </mtr> <mtr> <mtd></mtd> <mtd> <mi></mi> <mo>=</mo> <mo stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>+</mo> <mo 
stretchy="false">(</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo>⋅<!-- ⋅ --></mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>,</mo> </mtd> </mtr> </mtable> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\begin{aligned}\operatorname {fl} (x\cdot y)&=\operatorname {fl} {\big (}\operatorname {fl} (x_{1}\cdot y_{1})+\operatorname {fl} (x_{2}\cdot y_{2}){\big )},&&{\text{ where }}\operatorname {fl} (){\text{ indicates correctly rounded floating-point arithmetic}}\\&=\operatorname {fl} {\big (}(x_{1}\cdot y_{1})(1+\delta _{1})+(x_{2}\cdot y_{2})(1+\delta _{2}){\big )},&&{\text{ where }}\delta _{n}\leq \mathrm {E} _{\text{mach}},{\text{ from above}}\\&={\big (}(x_{1}\cdot y_{1})(1+\delta _{1})+(x_{2}\cdot y_{2})(1+\delta _{2}){\big )}(1+\delta _{3})\\&=(x_{1}\cdot y_{1})(1+\delta _{1})(1+\delta _{3})+(x_{2}\cdot y_{2})(1+\delta _{2})(1+\delta _{3}),\end{aligned}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91cfb9605c32f283534751331630a5a4fd1e0e97" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -6.171ex; width:130.907ex; height:13.509ex;" alt="{\displaystyle {\begin{aligned}\operatorname {fl} (x\cdot y)&=\operatorname {fl} {\big (}\operatorname {fl} (x_{1}\cdot y_{1})+\operatorname {fl} (x_{2}\cdot y_{2}){\big )},&&{\text{ where }}\operatorname {fl} (){\text{ indicates correctly rounded floating-point arithmetic}}\\&=\operatorname {fl} {\big (}(x_{1}\cdot y_{1})(1+\delta _{1})+(x_{2}\cdot 
y_{2})(1+\delta _{2}){\big )},&&{\text{ where }}\delta _{n}\leq \mathrm {E} _{\text{mach}},{\text{ from above}}\\&={\big (}(x_{1}\cdot y_{1})(1+\delta _{1})+(x_{2}\cdot y_{2})(1+\delta _{2}){\big )}(1+\delta _{3})\\&=(x_{1}\cdot y_{1})(1+\delta _{1})(1+\delta _{3})+(x_{2}\cdot y_{2})(1+\delta _{2})(1+\delta _{3}),\end{aligned}}}" /></span> and so <span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \operatorname {fl} (x\cdot y)={\hat {x}}\cdot {\hat {y}},}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>fl</mi> <mo>⁡<!-- --></mo> <mo stretchy="false">(</mo> <mi>x</mi> <mo>⋅<!-- ⋅ --></mo> <mi>y</mi> <mo stretchy="false">)</mo> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">^<!-- ^ --></mo> </mover> </mrow> </mrow> <mo>⋅<!-- ⋅ --></mo> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>y</mi> <mo stretchy="false">^<!-- ^ --></mo> </mover> </mrow> </mrow> <mo>,</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \operatorname {fl} (x\cdot y)={\hat {x}}\cdot {\hat {y}},}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/aaefe06c6b20bb5187afd515c1290cc79d51b170" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:15.389ex; height:2.843ex;" alt="{\displaystyle \operatorname {fl} (x\cdot y)={\hat {x}}\cdot {\hat {y}},}" /></span> </p><p>where </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle {\begin{aligned}{\hat {x}}_{1}&=x_{1}(1+\delta _{1});&{\hat {x}}_{2}&=x_{2}(1+\delta _{2});\\{\hat 
{y}}_{1}&=y_{1}(1+\delta _{3});&{\hat {y}}_{2}&=y_{2}(1+\delta _{3}),\\\end{aligned}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mrow class="MJX-TeXAtom-ORD"> <mtable columnalign="right left right left right left right left right left right left" rowspacing="3pt" columnspacing="0em 2em 0em 2em 0em 2em 0em 2em 0em 2em 0em" displaystyle="true"> <mtr> <mtd> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">^<!-- ^ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> </mtd> <mtd> <mi></mi> <mo>=</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>;</mo> </mtd> <mtd> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>x</mi> <mo stretchy="false">^<!-- ^ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> </mtd> <mtd> <mi></mi> <mo>=</mo> <msub> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>;</mo> </mtd> </mtr> <mtr> <mtd> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow class="MJX-TeXAtom-ORD"> <mover> <mi>y</mi> <mo stretchy="false">^<!-- ^ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> </mtd> <mtd> <mi></mi> <mo>=</mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>1</mn> </mrow> </msub> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>;</mo> </mtd> <mtd> <msub> <mrow class="MJX-TeXAtom-ORD"> <mrow 
class="MJX-TeXAtom-ORD"> <mover> <mi>y</mi> <mo stretchy="false">^<!-- ^ --></mo> </mover> </mrow> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> </mtd> <mtd> <mi></mi> <mo>=</mo> <msub> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msub> <mo stretchy="false">(</mo> <mn>1</mn> <mo>+</mo> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mn>3</mn> </mrow> </msub> <mo stretchy="false">)</mo> <mo>,</mo> </mtd> </mtr> </mtable> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle {\begin{aligned}{\hat {x}}_{1}&=x_{1}(1+\delta _{1});&{\hat {x}}_{2}&=x_{2}(1+\delta _{2});\\{\hat {y}}_{1}&=y_{1}(1+\delta _{3});&{\hat {y}}_{2}&=y_{2}(1+\delta _{3}),\\\end{aligned}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/0974c5b2aec0cd595a21dbd756adc581fe26fe7e" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -2.505ex; width:38.221ex; height:6.176ex;" alt="{\displaystyle {\begin{aligned}{\hat {x}}_{1}&=x_{1}(1+\delta _{1});&{\hat {x}}_{2}&=x_{2}(1+\delta _{2});\\{\hat {y}}_{1}&=y_{1}(1+\delta _{3});&{\hat {y}}_{2}&=y_{2}(1+\delta _{3}),\\\end{aligned}}}" /></span> </p><p>where </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-display mwe-math-mathml-a11y" style="display: none;"><math display="block" xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \delta _{n}\leq \mathrm {E} _{\text{mach}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msub> <mi>δ<!-- δ --></mi> <mrow class="MJX-TeXAtom-ORD"> <mi>n</mi> </mrow> </msub> <mo>≤<!-- ≤ --></mo> <msub> <mrow class="MJX-TeXAtom-ORD"> <mi mathvariant="normal">E</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mtext>mach</mtext> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \delta _{n}\leq \mathrm {E} _{\text{mach}}}</annotation> 
</semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9dab0eb7295a461df0cef551eb4afd052c966c04" class="mwe-math-fallback-image-display mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:10.999ex; height:2.676ex;" alt="{\displaystyle \delta _{n}\leq \mathrm {E} _{\text{mach}}}" /></span> </p><p>by definition, which is the sum of two slightly perturbed (on the order of Ε<sub>mach</sub>) input data, and so is backward stable. For more realistic examples in <a href="/wiki/Numerical_linear_algebra" title="Numerical linear algebra">numerical linear algebra</a>, see Higham 2002<sup id="cite_ref-Higham_2002_64-0" class="reference"><a href="#cite_note-Higham_2002-64"><span class="cite-bracket">[</span>54<span class="cite-bracket">]</span></a></sup> and other references below. </p> <div class="mw-heading mw-heading3"><h3 id="Minimizing_the_effect_of_accuracy_problems">Minimizing the effect of accuracy problems</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=21" title="Edit section: Minimizing the effect of accuracy problems"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Although individual arithmetic operations of IEEE 754 are guaranteed accurate to within half a <a href="/wiki/Unit_in_the_last_place" title="Unit in the last place">ULP</a>, more complicated formulae can suffer from larger errors for a variety of reasons. The loss of accuracy can be substantial if a problem or its data are <a href="/wiki/Condition_number" title="Condition number">ill-conditioned</a>, meaning that the correct result is hypersensitive to tiny perturbations in its data. 
However, even functions that are well-conditioned can suffer from large loss of accuracy if an algorithm <a href="/wiki/Numerical_stability" title="Numerical stability">numerically unstable</a> for that data is used: apparently equivalent formulations of expressions in a programming language can differ markedly in their numerical stability. One approach to remove the risk of such loss of accuracy is the design and analysis of numerically stable algorithms, which is an aim of the branch of mathematics known as <a href="/wiki/Numerical_analysis" title="Numerical analysis">numerical analysis</a>. Another approach that can protect against the risk of numerical instabilities is the computation of intermediate (scratch) values in an algorithm at a higher precision than the final result requires,<sup id="cite_ref-OliveiraStewart_2006_65-0" class="reference"><a href="#cite_note-OliveiraStewart_2006-65"><span class="cite-bracket">[</span>55<span class="cite-bracket">]</span></a></sup> which can remove, or reduce by orders of magnitude,<sup id="cite_ref-Kahan_2005_ARITH17_66-0" class="reference"><a href="#cite_note-Kahan_2005_ARITH17-66"><span class="cite-bracket">[</span>56<span class="cite-bracket">]</span></a></sup> such risk: <a href="/wiki/Quadruple-precision_floating-point_format" title="Quadruple-precision floating-point format">IEEE 754 quadruple precision</a> and <a href="/wiki/Extended_precision" title="Extended precision">extended precision</a> are designed for this purpose when computing at double precision.<sup id="cite_ref-Kahan_2011_Debug_67-0" class="reference"><a href="#cite_note-Kahan_2011_Debug-67"><span class="cite-bracket">[</span>57<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-NB_4_68-0" class="reference"><a href="#cite_note-NB_4-68"><span class="cite-bracket">[</span>nb 11<span class="cite-bracket">]</span></a></sup> </p><p>For example, the following algorithm is a direct implementation to compute the function <span 
class="texhtml"><var style="padding-right: 1px;">A</var>(<var style="padding-right: 1px;">x</var>) = (<var style="padding-right: 1px;">x</var>−1) / (exp(<var style="padding-right: 1px;">x</var>−1) − 1)</span> which is well-conditioned at 1.0;<sup id="cite_ref-NB_5_69-0" class="reference"><a href="#cite_note-NB_5-69"><span class="cite-bracket">[</span>nb 12<span class="cite-bracket">]</span></a></sup> however, it can be shown to be numerically unstable and lose up to half the significant digits carried by the arithmetic when computed near 1.0.<sup id="cite_ref-Kahan_2001_JavaHurt_70-0" class="reference"><a href="#cite_note-Kahan_2001_JavaHurt-70"><span class="cite-bracket">[</span>58<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-highlight mw-highlight-lang-c mw-content-ltr mw-highlight-lines" dir="ltr"><pre><span></span><span class="linenos" data-line="1"></span><span class="kt">double</span><span class="w"> </span><span class="nf">A</span><span class="p">(</span><span class="kt">double</span><span class="w"> </span><span class="n">X</span><span class="p">)</span>
<span class="linenos" data-line="2"></span><span class="p">{</span>
<span class="hll"><span class="linenos" data-line="3"></span><span class="w">    </span><span class="kt">double</span><span class="w"> </span><span class="n">Y</span><span class="p">,</span><span class="w"> </span><span class="n">Z</span><span class="p">;</span><span class="w"> </span><span class="c1">// [1]</span>
</span><span class="linenos" data-line="4"></span><span class="w">    </span><span class="n">Y</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">X</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mf">1.0</span><span class="p">;</span>
<span class="linenos" data-line="5"></span><span class="w">    </span><span class="n">Z</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">exp</span><span class="p">(</span><span class="n">Y</span><span class="p">);</span>
<span class="linenos" data-line="6"></span><span class="w">    </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">Z</span><span class="w"> </span><span class="o">!=</span><span class="w"> </span><span class="mf">1.0</span><span class="p">)</span>
<span class="hll"><span class="linenos" data-line="7"></span><span class="w">        </span><span class="n">Z</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">Y</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="p">(</span><span class="n">Z</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mf">1.0</span><span class="p">);</span><span class="w"> </span><span class="c1">// [2]</span>
</span><span class="linenos" data-line="8"></span><span class="w">    </span><span class="k">return</span><span class="w"> </span><span class="n">Z</span><span class="p">;</span>
<span class="linenos" data-line="9"></span><span class="p">}</span>
</pre></div> <p>If, however, intermediate computations are all performed in extended precision (e.g.
by setting line [1] to <a href="/wiki/C99" title="C99">C99</a> <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">long double</code>), then up to full precision in the final double result can be maintained.<sup id="cite_ref-NB_6_71-0" class="reference"><a href="#cite_note-NB_6-71"><span class="cite-bracket">[</span>nb 13<span class="cite-bracket">]</span></a></sup> Alternatively, a numerical analysis of the algorithm reveals that if the following non-obvious change to line [2] is made: </p> <div class="mw-highlight mw-highlight-lang-c mw-content-ltr" dir="ltr"><pre><span></span><span class="n">Z</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">log</span><span class="p">(</span><span class="n">Z</span><span class="p">)</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="p">(</span><span class="n">Z</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mf">1.0</span><span class="p">);</span> </pre></div> <p>then the algorithm becomes numerically stable and can compute to full double precision. </p><p>To maintain the properties of such carefully constructed numerically stable programs, careful handling by the <a href="/wiki/Compiler" title="Compiler">compiler</a> is required. Certain "optimizations" that compilers might make (for example, reordering operations) can work against the goals of well-behaved software. There is some controversy about the failings of compilers and language designs in this area: C99 is an example of a language where such optimizations are carefully specified to maintain numerical precision. See the external references at the bottom of this article. 
</p><p>A detailed treatment of the techniques for writing high-quality floating-point software is beyond the scope of this article, and the reader is referred to the cited works<sup id="cite_ref-Higham_2002_64-1" class="reference"><a href="#cite_note-Higham_2002-64"><span class="cite-bracket">[</span>54<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Kahan_2000_Marketing_72-0" class="reference"><a href="#cite_note-Kahan_2000_Marketing-72"><span class="cite-bracket">[</span>59<span class="cite-bracket">]</span></a></sup> and the other references at the bottom of this article. Kahan suggests several rules of thumb that can decrease the risk of numerical anomalies by orders of magnitude,<sup id="cite_ref-Kahan_2000_Marketing_72-1" class="reference"><a href="#cite_note-Kahan_2000_Marketing-72"><span class="cite-bracket">[</span>59<span class="cite-bracket">]</span></a></sup> in addition to, or in lieu of, a more careful numerical analysis. These include: as noted above, computing all expressions and intermediate results in the highest precision supported in hardware (a common rule of thumb is to carry twice the precision of the desired result, i.e.
compute in double precision for a final single-precision result, or in double extended or quad precision for up to double-precision results<sup id="cite_ref-Kahan_1981_WhyIEEE_73-0" class="reference"><a href="#cite_note-Kahan_1981_WhyIEEE-73"><span class="cite-bracket">[</span>60<span class="cite-bracket">]</span></a></sup>); and rounding input data and results to only the precision required and supported by the input data (carrying excess precision in the final result beyond that required and supported by the input data can be misleading, increases storage cost and decreases speed, and the excess bits can affect convergence of numerical procedures:<sup id="cite_ref-Kahan_2001_LN_74-0" class="reference"><a href="#cite_note-Kahan_2001_LN-74"><span class="cite-bracket">[</span>61<span class="cite-bracket">]</span></a></sup> notably, the first form of the iterative example given below converges correctly when using this rule of thumb). Brief descriptions of several additional issues and techniques follow. </p><p>As decimal fractions can often not be exactly represented in binary floating-point, such arithmetic is at its best when it is simply being used to measure real-world quantities over a wide range of scales (such as the orbital period of a moon around Saturn or the mass of a <a href="/wiki/Proton" title="Proton">proton</a>), and at its worst when it is expected to model the interactions of quantities expressed as decimal strings that are expected to be exact.<sup id="cite_ref-Kahan_2005_ARITH17_66-1" class="reference"><a href="#cite_note-Kahan_2005_ARITH17-66"><span class="cite-bracket">[</span>56<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-Kahan_2000_Marketing_72-2" class="reference"><a href="#cite_note-Kahan_2000_Marketing-72"><span class="cite-bracket">[</span>59<span class="cite-bracket">]</span></a></sup> An example of the latter case is financial calculations. 
For this reason, financial software tends not to use a binary floating-point number representation.<sup id="cite_ref-Speleotrove_2012_75-0" class="reference"><a href="#cite_note-Speleotrove_2012-75"><span class="cite-bracket">[</span>62<span class="cite-bracket">]</span></a></sup> The "decimal" data type of the <a href="/wiki/C_Sharp_(programming_language)" title="C Sharp (programming language)">C#</a> and <a href="/wiki/Python_(programming_language)" title="Python (programming language)">Python</a> programming languages, and the decimal formats of the <a href="/wiki/IEEE_754-2008" class="mw-redirect" title="IEEE 754-2008">IEEE 754-2008</a> standard, are designed to avoid the problems of binary floating-point representations when applied to human-entered exact decimal values, and make the arithmetic always behave as expected when numbers are printed in decimal. </p><p>Expectations from mathematics may not be realized in the field of floating-point computation. For example, it is known that <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle (x+y)(x-y)=x^{2}-y^{2}\,}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mo stretchy="false">(</mo> <mi>x</mi> <mo>+</mo> <mi>y</mi> <mo stretchy="false">)</mo> <mo stretchy="false">(</mo> <mi>x</mi> <mo>−<!-- − --></mo> <mi>y</mi> <mo stretchy="false">)</mo> <mo>=</mo> <msup> <mi>x</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mo>−<!-- − --></mo> <msup> <mi>y</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mspace width="thinmathspace"></mspace> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle (x+y)(x-y)=x^{2}-y^{2}\,}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/42f6b57467359a23ff92ad97626a078c836d5a3e" 
class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:25.194ex; height:3.176ex;" alt="{\displaystyle (x+y)(x-y)=x^{2}-y^{2}\,}" /></span>, and that <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \sin ^{2}{\theta }+\cos ^{2}{\theta }=1\,}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <msup> <mi>sin</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> <mo>+</mo> <msup> <mi>cos</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msup> <mo>⁡<!-- --></mo> <mrow class="MJX-TeXAtom-ORD"> <mi>θ<!-- θ --></mi> </mrow> <mo>=</mo> <mn>1</mn> <mspace width="thinmathspace"></mspace> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \sin ^{2}{\theta }+\cos ^{2}{\theta }=1\,}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/a9d6c234a50e2ecda8445a62785f9c27167a4d34" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.505ex; width:18.519ex; height:2.843ex;" alt="{\displaystyle \sin ^{2}{\theta }+\cos ^{2}{\theta }=1\,}" /></span>, however these facts cannot be relied on when the quantities involved are the result of floating-point computation. </p><p>The use of the equality test (<code>if (x==y) ...</code>) requires care when dealing with floating-point numbers. 
Even simple expressions like <code>0.6/0.2-3==0</code> will, on most computers, fail to be true<sup id="cite_ref-Christiansen_Perl_76-0" class="reference"><a href="#cite_note-Christiansen_Perl-76"><span class="cite-bracket">[</span>63<span class="cite-bracket">]</span></a></sup> (in IEEE 754 double precision, for example, <code>0.6/0.2 - 3</code> is approximately equal to <span class="nowrap"><span data-sort-value="3015555910790149937♠"></span>−4.440<span style="margin-left:.25em;">892</span><span style="margin-left:.25em;">098</span><span style="margin-left:.25em;">500</span><span style="margin-left:.25em;">63</span><span style="margin-left:0.25em;margin-right:0.15em;">×</span>10<sup>−16</sup></span>). Consequently, such tests are sometimes replaced with "fuzzy" comparisons (<code>if (abs(x-y) &lt; epsilon) ...</code>, where epsilon is sufficiently small and tailored to the application, such as 1.0E−13). The wisdom of doing this varies greatly, and can require numerical analysis to bound epsilon.<sup id="cite_ref-Higham_2002_64-2" class="reference"><a href="#cite_note-Higham_2002-64"><span class="cite-bracket">[</span>54<span class="cite-bracket">]</span></a></sup> Computations on values derived from the primary data representation, and comparisons of those values, should be performed in a wider, extended precision to minimize the risk of such inconsistencies due to round-off errors.<sup id="cite_ref-Kahan_2000_Marketing_72-3" class="reference"><a href="#cite_note-Kahan_2000_Marketing-72"><span class="cite-bracket">[</span>59<span class="cite-bracket">]</span></a></sup> It is often better to organize the code in such a way that such tests are unnecessary.
For example, in <a href="/wiki/Computational_geometry" title="Computational geometry">computational geometry</a>, exact tests of whether a point lies off or on a line or plane defined by other points can be performed using adaptive precision or exact arithmetic methods.<sup id="cite_ref-Shewchuk_77-0" class="reference"><a href="#cite_note-Shewchuk-77"><span class="cite-bracket">[</span>64<span class="cite-bracket">]</span></a></sup> </p><p>Small errors in floating-point arithmetic can grow when mathematical algorithms perform operations an enormous number of times. A few examples are <a href="/wiki/Matrix_inversion" class="mw-redirect" title="Matrix inversion">matrix inversion</a>, <a href="/wiki/Eigenvector" class="mw-redirect" title="Eigenvector">eigenvector</a> computation, and differential equation solving. These algorithms must be very carefully designed, using numerical approaches such as <a href="/wiki/Iterative_refinement" title="Iterative refinement">iterative refinement</a>, if they are to work well.<sup id="cite_ref-Kahan_1997_Cantilever_78-0" class="reference"><a href="#cite_note-Kahan_1997_Cantilever-78"><span class="cite-bracket">[</span>65<span class="cite-bracket">]</span></a></sup> </p><p>Summation of a vector of floating-point values is a basic algorithm in <a href="/wiki/Computational_science" title="Computational science">scientific computing</a>, and so an awareness of when loss of significance can occur is essential. For example, if one is adding a very large number of numbers, the individual addends are very small compared with the sum. This can lead to loss of significance. A typical addition would then be something like </p>
<pre>3253.671
+  3.141276
-----------
3256.812
</pre>
<p>The low 3 digits of the addends are effectively lost. Suppose, for example, that one needs to add many numbers, all approximately equal to 3. After 1000 of them have been added, the running sum is about 3000; the lost digits are not regained.
The <a href="/wiki/Kahan_summation_algorithm" title="Kahan summation algorithm">Kahan summation algorithm</a> may be used to reduce the errors.<sup id="cite_ref-Higham_2002_64-3" class="reference"><a href="#cite_note-Higham_2002-64"><span class="cite-bracket">[</span>54<span class="cite-bracket">]</span></a></sup> </p><p>Round-off error can affect the convergence and accuracy of iterative numerical procedures. As an example, <a href="/wiki/Archimedes" title="Archimedes">Archimedes</a> approximated π by calculating the perimeters of polygons inscribing and circumscribing a circle, starting with hexagons, and successively doubling the number of sides. As noted above, computations may be rearranged in a way that is mathematically equivalent but less prone to error (<a href="/wiki/Numerical_analysis" title="Numerical analysis">numerical analysis</a>). Two forms of the recurrence formula for the circumscribed polygon are:<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="Not obvious formulas (June 2016)">citation needed</span></a></i>]</sup> </p> <ul><li><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\textstyle t_{0}={\frac {1}{\sqrt {3}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="false" scriptlevel="0"> <msub> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mn>0</mn> </mrow> </msub> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mn>1</mn> <msqrt> <mn>3</mn> </msqrt> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\textstyle t_{0}={\frac {1}{\sqrt {3}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/a9a8d68112b0a6f1a41b69213bdc887a4f8bae7d" class="mwe-math-fallback-image-inline mw-invert skin-invert" 
aria-hidden="true" style="vertical-align: -1.838ex; width:8.019ex; height:4.176ex;" alt="{\textstyle t_{0}={\frac {1}{\sqrt {3}}}}" /></span></li> <li>First form: <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\textstyle t_{i+1}={\frac {{\sqrt {t_{i}^{2}+1}}-1}{t_{i}}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="false" scriptlevel="0"> <msub> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mo>+</mo> <mn>1</mn> </mrow> </msub> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <mrow> <mrow class="MJX-TeXAtom-ORD"> <msqrt> <msubsup> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msubsup> <mo>+</mo> <mn>1</mn> </msqrt> </mrow> <mo>−<!-- − --></mo> <mn>1</mn> </mrow> <msub> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\textstyle t_{i+1}={\frac {{\sqrt {t_{i}^{2}+1}}-1}{t_{i}}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/7b8e42962e073104445e5fb184833851ef480065" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -1.505ex; width:14.944ex; height:5.843ex;" alt="{\textstyle t_{i+1}={\frac {{\sqrt {t_{i}^{2}+1}}-1}{t_{i}}}}" /></span></li> <li>Second form: <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\textstyle t_{i+1}={\frac {t_{i}}{{\sqrt {t_{i}^{2}+1}}+1}}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="false" scriptlevel="0"> <msub> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> <mo>+</mo> <mn>1</mn> </mrow> </msub> <mo>=</mo> <mrow class="MJX-TeXAtom-ORD"> <mfrac> <msub> <mi>t</mi> <mrow 
class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> <mrow> <mrow class="MJX-TeXAtom-ORD"> <msqrt> <msubsup> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> <mrow class="MJX-TeXAtom-ORD"> <mn>2</mn> </mrow> </msubsup> <mo>+</mo> <mn>1</mn> </msqrt> </mrow> <mo>+</mo> <mn>1</mn> </mrow> </mfrac> </mrow> </mstyle> </mrow> <annotation encoding="application/x-tex">{\textstyle t_{i+1}={\frac {t_{i}}{{\sqrt {t_{i}^{2}+1}}+1}}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/171333bbf207a0be4a87530af334d931d755e13d" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -3.171ex; width:14.944ex; height:5.843ex;" alt="{\textstyle t_{i+1}={\frac {t_{i}}{{\sqrt {t_{i}^{2}+1}}+1}}}" /></span></li> <li><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle \pi \sim 6\times 2^{i}\times t_{i}}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>π<!-- π --></mi> <mo>∼<!-- ∼ --></mo> <mn>6</mn> <mo>×<!-- × --></mo> <msup> <mn>2</mn> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msup> <mo>×<!-- × --></mo> <msub> <mi>t</mi> <mrow class="MJX-TeXAtom-ORD"> <mi>i</mi> </mrow> </msub> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle \pi \sim 6\times 2^{i}\times t_{i}}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/c19a2c9acbd64b8495a8e1aa2fe018103c500d61" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.671ex; width:14.875ex; height:3.009ex;" alt="{\displaystyle \pi \sim 6\times 2^{i}\times t_{i}}" /></span>, converging as <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math 
xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle i\rightarrow \infty }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>i</mi> <mo stretchy="false">→<!-- → --></mo> <mi mathvariant="normal">∞<!-- ∞ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle i\rightarrow \infty }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9912b98084f3a5fa477e9fbe25597750d2c375fd" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:6.74ex; height:2.176ex;" alt="{\displaystyle i\rightarrow \infty }" /></span></li></ul> <p>Here is a computation using IEEE "double" (a significand with 53 bits of precision) arithmetic: </p>
<pre> i   6 × 2<sup>i</sup> × t<sub>i</sub>, first form    6 × 2<sup>i</sup> × t<sub>i</sub>, second form
---------------------------------------------------------
 0   <b><span style="color:purple;">3</span></b>.4641016151377543863      <b><span style="color:purple;">3</span></b>.4641016151377543863
 1   <b><span style="color:purple;">3</span></b>.2153903091734710173      <b><span style="color:purple;">3</span></b>.2153903091734723496
 2   <b><span style="color:purple;">3.1</span></b>596599420974940120      <b><span style="color:purple;">3.1</span></b>596599420975006733
 3   <b><span style="color:purple;">3.14</span></b>60862151314012979      <b><span style="color:purple;">3.14</span></b>60862151314352708
 4   <b><span style="color:purple;">3.14</span></b>27145996453136334      <b><span style="color:purple;">3.14</span></b>27145996453689225
 5   <b><span style="color:purple;">3.141</span></b>8730499801259536      <b><span style="color:purple;">3.141</span></b>8730499798241950
 6   <b><span style="color:purple;">3.141</span></b>6627470548084133      <b><span style="color:purple;">3.141</span></b>6627470568494473
 7   <b><span style="color:purple;">3.141</span></b>6101765997805905      <b><span style="color:purple;">3.141</span></b>6101766046906629
 8   <b><span style="color:purple;">3.14159</span></b>70343230776862      <b><span style="color:purple;">3.14159</span></b>70343215275928
 9   <b><span style="color:purple;">3.14159</span></b>37488171150615      <b><span style="color:purple;">3.14159</span></b>37487713536668
10   <b><span style="color:purple;">3.141592</span></b>9278733740748      <b><span style="color:purple;">3.141592</span></b>9273850979885
11   <b><span style="color:purple;">3.141592</span></b>7256228504127      <b><span style="color:purple;">3.141592</span></b>7220386148377
12   <b><span style="color:purple;">3.1415926</span></b>717412858693      <b><span style="color:purple;">3.1415926</span></b>707019992125
13   <b><span style="color:purple;">3.1415926</span></b>189011456060      <b><span style="color:purple;">3.14159265</span></b>78678454728
14   <b><span style="color:purple;">3.1415926</span></b>717412858693      <b><span style="color:purple;">3.14159265</span></b>46593073709
15   <b><span style="color:purple;">3.14159</span></b>19358822321783      <b><span style="color:purple;">3.141592653</span></b>8571730119
16   <b><span style="color:purple;">3.1415926</span></b>717412858693      <b><span style="color:purple;">3.141592653</span></b>6566394222
17   <b><span style="color:purple;">3.1415</span></b>810075796233302      <b><span style="color:purple;">3.141592653</span></b>6065061913
18   <b><span style="color:purple;">3.1415926</span></b>717412858693      <b><span style="color:purple;">3.1415926535</span></b>939728836
19   <b><span style="color:purple;">3.141</span></b>4061547378810956      <b><span style="color:purple;">3.1415926535</span></b>908393901
20   <b><span style="color:purple;">3.14</span></b>05434924008406305      <b><span style="color:purple;">3.1415926535</span></b>900560168
21   <b><span style="color:purple;">3.14</span></b>00068646912273617      <b><span style="color:purple;">3.141592653589</span></b>8608396
22   <b><span style="color:purple;">3.1</span></b>349453756585929919      <b><span style="color:purple;">3.141592653589</span></b>8122118
23   <b><span style="color:purple;">3.14</span></b>00068646912273617      <b><span style="color:purple;">3.14159265358979</span></b>95552
24   <b><span style="color:purple;">3</span></b>.2245152435345525443      <b><span style="color:purple;">3.14159265358979</span></b>68907
25                              <b><span style="color:purple;">3.14159265358979</span></b>62246
26                              <b><span style="color:purple;">3.14159265358979</span></b>62246
27                              <b><span style="color:purple;">3.14159265358979</span></b>62246
28                              <b><span style="color:purple;">3.14159265358979</span></b>62246
The true value is <b><span style="color:purple;">3.14159265358979323846264338327...</span></b>
</pre>
<p>While the two forms of the recurrence formula are clearly mathematically equivalent,<sup id="cite_ref-NB_7_79-0" class="reference"><a href="#cite_note-NB_7-79"><span class="cite-bracket">[</span>nb 14<span class="cite-bracket">]</span></a></sup> the first subtracts 1 from a number extremely close to 1, leading to an increasingly problematic loss of <a href="/wiki/Significant_digit" class="mw-redirect" title="Significant digit">significant digits</a>. As the recurrence is applied repeatedly, the accuracy improves at first, but then it deteriorates. It never gets better than about 8 digits, even though 53-bit arithmetic should be capable of about 16 digits of precision. When the second form of the recurrence is used, the value converges to 15 digits of precision.
</p> <div class="mw-heading mw-heading3"><h3 id="&quot;Fast_math&quot;_optimization"><span id=".22Fast_math.22_optimization"></span>"Fast math" optimization</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=22" title="Edit section: &quot;Fast math&quot; optimization"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The aforementioned lack of <a href="/wiki/Associative_property" title="Associative property">associativity</a> of floating-point operations in general means that <a href="/wiki/Compilers" class="mw-redirect" title="Compilers">compilers</a> cannot as effectively reorder arithmetic expressions as they could with integer and fixed-point arithmetic, presenting a roadblock in optimizations such as <a href="/wiki/Common_subexpression_elimination" title="Common subexpression elimination">common subexpression elimination</a> and auto-<a href="/wiki/Single_instruction,_multiple_data" title="Single instruction, multiple data">vectorization</a>.<sup id="cite_ref-Vectorizers_80-0" class="reference"><a href="#cite_note-Vectorizers-80"><span class="cite-bracket">[</span>66<span class="cite-bracket">]</span></a></sup> The "fast math" option on many compilers (ICC, GCC, Clang, MSVC...) turns on reassociation along with unsafe assumptions such as a lack of NaN and infinite numbers in IEEE 754. Some compilers also offer more granular options to only turn on reassociation. 
In either case, the programmer is exposed to many of the precision pitfalls mentioned above for the portion of the program using "fast" math.<sup id="cite_ref-FPM_81-0" class="reference"><a href="#cite_note-FPM-81"><span class="cite-bracket">[</span>67<span class="cite-bracket">]</span></a></sup> </p><p>In some compilers (GCC and Clang), turning on "fast" math may cause the program to <a href="/wiki/Subnormal_number#Disabling_subnormal_floats_at_the_code_level" title="Subnormal number">disable subnormal floats</a> at startup, affecting the floating-point behavior of not only the generated code, but also any program using such code as a <a href="/wiki/Library_(computing)" title="Library (computing)">library</a>.<sup id="cite_ref-harmful_82-0" class="reference"><a href="#cite_note-harmful-82"><span class="cite-bracket">[</span>68<span class="cite-bracket">]</span></a></sup> </p><p>In most <a href="/wiki/Fortran" title="Fortran">Fortran</a> compilers, as allowed by the ISO/IEC 1539-1:2004 Fortran standard, reassociation is the default, with breakage largely prevented by the "protect parens" setting (also on by default). This setting stops the compiler from reassociating beyond the boundaries of parentheses.<sup id="cite_ref-Gen_83-0" class="reference"><a href="#cite_note-Gen-83"><span class="cite-bracket">[</span>69<span class="cite-bracket">]</span></a></sup> <a href="/wiki/Intel_Fortran_Compiler" title="Intel Fortran Compiler">Intel Fortran Compiler</a> is a notable outlier.<sup id="cite_ref-zheevd_84-0" class="reference"><a href="#cite_note-zheevd-84"><span class="cite-bracket">[</span>70<span class="cite-bracket">]</span></a></sup> </p><p>A common problem in "fast" math is that subexpressions may not be optimized identically from place to place, leading to unexpected differences. One interpretation of the issue is that "fast" math, as currently implemented, has poorly defined semantics. 
One attempt at formalizing "fast" math optimizations is seen in <i>Icing</i>, a verified compiler.<sup id="cite_ref-Becker-Darulova-Myreen-Tatlock_2019_85-0" class="reference"><a href="#cite_note-Becker-Darulova-Myreen-Tatlock_2019-85"><span class="cite-bracket">[</span>71<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="See_also">See also</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=23" title="Edit section: See also"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1184024115">.mw-parser-output .div-col{margin-top:0.3em;column-width:30em}.mw-parser-output .div-col-small{font-size:90%}.mw-parser-output .div-col-rules{column-rule:1px solid #aaa}.mw-parser-output .div-col dl,.mw-parser-output .div-col ol,.mw-parser-output .div-col ul{margin-top:0}.mw-parser-output .div-col li,.mw-parser-output .div-col dd{page-break-inside:avoid;break-inside:avoid-column}</style><div class="div-col" style="column-width: 20em;"> <ul><li><a href="/wiki/Arbitrary-precision_arithmetic" title="Arbitrary-precision arithmetic">Arbitrary-precision arithmetic</a></li> <li><a href="/wiki/C99#IEEE_754_floating-point_support" title="C99">C99</a> for code examples demonstrating access and use of IEEE 754 features.</li> <li><a href="/wiki/Computable_number" title="Computable number">Computable number</a></li> <li><a href="/wiki/Coprocessor" title="Coprocessor">Coprocessor</a></li> <li><a href="/wiki/Decimal_floating_point" title="Decimal floating point">Decimal floating point</a></li> <li><a href="/wiki/Double-precision_floating-point_format" title="Double-precision floating-point format">Double-precision floating-point format</a></li> <li><a href="/wiki/Experimental_mathematics" title="Experimental mathematics">Experimental mathematics</a> – utilizes high precision 
floating-point computations</li> <li><a href="/wiki/Fixed-point_arithmetic" title="Fixed-point arithmetic">Fixed-point arithmetic</a></li> <li><a href="/wiki/Floating-point_error_mitigation" title="Floating-point error mitigation">Floating-point error mitigation</a></li> <li><a href="/wiki/FLOPS" class="mw-redirect" title="FLOPS">FLOPS</a></li> <li><a href="/wiki/Gal%27s_accurate_tables" title="Gal's accurate tables">Gal's accurate tables</a></li> <li><a href="/wiki/GNU_MPFR" title="GNU MPFR">GNU MPFR</a></li> <li><a href="/wiki/Half-precision_floating-point_format" title="Half-precision floating-point format">Half-precision floating-point format</a></li> <li><a href="/wiki/IEEE_754" title="IEEE 754">IEEE 754</a> – Standard for Binary Floating-Point Arithmetic</li> <li><a href="/wiki/IBM_hexadecimal_floating-point" title="IBM hexadecimal floating-point">IBM Floating Point Architecture</a></li> <li><a href="/wiki/Kahan_summation_algorithm" title="Kahan summation algorithm">Kahan summation algorithm</a></li> <li><a href="/wiki/Microsoft_Binary_Format" title="Microsoft Binary Format">Microsoft Binary Format</a> (MBF)</li> <li><a href="/wiki/Minifloat" title="Minifloat">Minifloat</a></li> <li><a href="/wiki/Q_(number_format)" title="Q (number format)">Q (number format)</a> for constant resolution</li> <li><a href="/wiki/Quadruple-precision_floating-point_format" title="Quadruple-precision floating-point format">Quadruple-precision floating-point format</a> (including double-double)</li> <li><a href="/wiki/Significant_figures" title="Significant figures">Significant figures</a></li> <li><a href="/wiki/Single-precision_floating-point_format" title="Single-precision floating-point format">Single-precision floating-point format</a></li> <li><a href="/wiki/Standard_Apple_Numerics_Environment" title="Standard Apple Numerics Environment">Standard Apple Numerics Environment (SANE)</a></li></ul> </div> <div class="mw-heading mw-heading2"><h2 id="Notes">Notes</h2><span 
class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=24" title="Edit section: Notes"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239543626">.mw-parser-output .reflist{margin-bottom:0.5em;list-style-type:decimal}@media screen{.mw-parser-output .reflist{font-size:90%}}.mw-parser-output .reflist .references{font-size:100%;margin-bottom:0;list-style-type:inherit}.mw-parser-output .reflist-columns-2{column-width:30em}.mw-parser-output .reflist-columns-3{column-width:25em}.mw-parser-output .reflist-columns{margin-top:0.3em}.mw-parser-output .reflist-columns ol{margin-top:0}.mw-parser-output .reflist-columns li{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .reflist-upper-alpha{list-style-type:upper-alpha}.mw-parser-output .reflist-upper-roman{list-style-type:upper-roman}.mw-parser-output .reflist-lower-alpha{list-style-type:lower-alpha}.mw-parser-output .reflist-lower-greek{list-style-type:lower-greek}.mw-parser-output .reflist-lower-roman{list-style-type:lower-roman}</style><div class="reflist"> <div class="mw-references-wrap mw-references-columns"><ol class="references"> <li id="cite_note-NB_Significand-4"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_Significand_4-0">^</a></b></span> <span class="reference-text"><span class="anchor" id="NB-Significand"></span>The <i><a href="/wiki/Significand" title="Significand">significand</a></i> of a floating-point number is also called <i>mantissa</i> by some authors—not to be confused with the <a href="/wiki/Mantissa_(logarithm)" class="mw-redirect" title="Mantissa (logarithm)">mantissa</a> of a <a href="/wiki/Logarithm" title="Logarithm">logarithm</a>. Somewhat vague terms such as <i>coefficient</i> or <i>argument</i> are also used by some. The usage of the term <i>fraction</i> by some authors is potentially misleading as well. 
The term <i>characteristic</i> (as used e.g. by <a href="/wiki/Control_Data_Corporation" title="Control Data Corporation">CDC</a>) is ambiguous, as it was historically also used to specify some form of <a href="#NB-Exponent">exponent</a> of floating-point numbers.</span> </li> <li id="cite_note-NB_Exponent-5"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_Exponent_5-0">^</a></b></span> <span class="reference-text"><span class="anchor" id="NB-Exponent"></span>The <i><a href="/wiki/Exponent" class="mw-redirect" title="Exponent">exponent</a></i> of a floating-point number is sometimes also referred to as <i>scale</i>. The term <i>characteristic</i> (for <i><a href="/wiki/Biased_exponent" class="mw-redirect" title="Biased exponent">biased exponent</a></i>, <i>exponent bias</i>, or <i>excess n representation</i>) is ambiguous, as it was historically also used to specify the <a href="#NB-Significand">significand</a> of floating-point numbers.</span> </li> <li id="cite_note-NB_9-8"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_9_8-0">^</a></b></span> <span class="reference-text"><a href="/wiki/Hexadecimal_floating-point" class="mw-redirect" title="Hexadecimal floating-point">Hexadecimal (base-16) floating-point</a> arithmetic is used in the <a href="/wiki/IBM_System_360" class="mw-redirect" title="IBM System 360">IBM System 360</a> (1964) and <a href="/wiki/IBM_System_370" class="mw-redirect" title="IBM System 370">370</a> (1970) as well as various newer IBM machines, in the <a href="/wiki/RCA_Spectra_70" title="RCA Spectra 70">RCA Spectra 70</a> (1964), the Siemens 4004 (1965), 7.700 (1974), 7.800, 7.500 (1977) series mainframes and successors, the Unidata 7.000 series mainframes, the <a href="/wiki/Manchester_MU5" class="mw-redirect" title="Manchester MU5">Manchester MU5</a> (1972), the <a href="/wiki/Heterogeneous_Element_Processor" title="Heterogeneous Element Processor">HEP</a> (1982) computers, and in 360/370-compatible mainframe families made by 
Fujitsu, Amdahl and Hitachi. It is also used in the <a href="/wiki/Illinois_ILLIAC_III" class="mw-redirect" title="Illinois ILLIAC III">Illinois ILLIAC III</a> (1966), <a href="/wiki/Data_General_Eclipse_S/200" class="mw-redirect" title="Data General Eclipse S/200">Data General Eclipse S/200</a> (ca. 1974), <a href="/wiki/Gould_Powernode_9080" class="mw-redirect" title="Gould Powernode 9080">Gould Powernode 9080</a> (1980s), <a href="/wiki/Interdata_8/32" class="mw-redirect" title="Interdata 8/32">Interdata 8/32</a> (1970s), the <a href="/w/index.php?title=SEL_System_85&action=edit&redlink=1" class="new" title="SEL System 85 (page does not exist)">SEL Systems 85</a> and <a href="/w/index.php?title=SEL_System_86&action=edit&redlink=1" class="new" title="SEL System 86 (page does not exist)">86</a> as well as the <a href="/wiki/SDS_Sigma_5" class="mw-redirect" title="SDS Sigma 5">SDS Sigma 5</a> (1967), <a href="/wiki/SDS_Sigma_7" class="mw-redirect" title="SDS Sigma 7">7</a> (1966) and <a href="/wiki/Xerox_Sigma_9" title="Xerox Sigma 9">Xerox Sigma 9</a> (1970).</span> </li> <li id="cite_note-NB_8-10"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_8_10-0">^</a></b></span> <span class="reference-text">Octal (base-8) floating-point arithmetic is used in the <a href="/wiki/Ferranti_Atlas" class="mw-redirect" title="Ferranti Atlas">Ferranti Atlas</a> (1962), <a href="/wiki/Burroughs_B5500" class="mw-redirect" title="Burroughs B5500">Burroughs B5500</a> (1964), <a href="/wiki/Burroughs_B5700" class="mw-redirect" title="Burroughs B5700">Burroughs B5700</a> (1971), <a href="/wiki/Burroughs_B6700" class="mw-redirect" title="Burroughs B6700">Burroughs B6700</a> (1971) and <a href="/wiki/Burroughs_B7700" class="mw-redirect" title="Burroughs B7700">Burroughs B7700</a> (1972) computers.</span> </li> <li id="cite_note-NB_11-12"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_11_12-0">^</a></b></span> <span class="reference-text">Quaternary (base-4) floating-point 
arithmetic is used in the <a href="/wiki/Illinois_ILLIAC_II" class="mw-redirect" title="Illinois ILLIAC II">Illinois ILLIAC II</a> (1962) computer. It is also used in the Digital Field System DFS IV and V high-resolution site survey systems.</span> </li> <li id="cite_note-NB_12-13"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_12_13-0">^</a></b></span> <span class="reference-text">Base-256 floating-point arithmetic is used in the <a href="/wiki/Rice_Institute_R1" class="mw-redirect" title="Rice Institute R1">Rice Institute R1</a> computer (since 1958).</span> </li> <li id="cite_note-NB_10-15"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_10_15-0">^</a></b></span> <span class="reference-text">Base-65536 floating-point arithmetic is used in the <a href="/wiki/MANIAC_II" title="MANIAC II">MANIAC II</a> (1956) computer.</span> </li> <li id="cite_note-NB_1-41"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_1_41-0">^</a></b></span> <span class="reference-text">Computer hardware does not necessarily compute the exact value; it simply has to produce the equivalent rounded result as though it had computed the infinitely precise result.</span> </li> <li id="cite_note-NB_2-54"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_2_54-0">^</a></b></span> <span class="reference-text">The enormous complexity of modern <a href="/wiki/Division_algorithm" title="Division algorithm">division algorithms</a> once led to a famous error. An early version of the <a href="/wiki/Intel_Pentium" class="mw-redirect" title="Intel Pentium">Intel Pentium</a> chip was shipped with a <a href="/wiki/FDIV" class="mw-redirect" title="FDIV">division instruction</a> that, on rare occasions, gave slightly incorrect results. Many computers had been shipped before the error was discovered. Until the defective computers were replaced, patched versions of compilers were developed that could avoid the failing cases. 
See <i><a href="/wiki/Pentium_FDIV_bug" title="Pentium FDIV bug">Pentium FDIV bug</a></i>.</span> </li> <li id="cite_note-NB_3-57"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_3_57-0">^</a></b></span> <span class="reference-text">But an attempted computation of cos(π) yields −1 exactly. Since the derivative is nearly zero near π, the effect of the inaccuracy in the argument is far smaller than the spacing of the floating-point numbers around −1, and the rounded result is exact.</span> </li> <li id="cite_note-NB_4-68"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_4_68-0">^</a></b></span> <span class="reference-text"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">William Kahan</a> notes: "Except in extremely uncommon situations, extra-precise arithmetic generally attenuates risks due to roundoff at far less cost than the price of a competent error-analyst."</span> </li> <li id="cite_note-NB_5-69"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_5_69-0">^</a></b></span> <span class="reference-text">The <a href="/wiki/Taylor_expansion" class="mw-redirect" title="Taylor expansion">Taylor expansion</a> of this function demonstrates that it is well-conditioned near 1: A(x) = 1 − (x−1)/2 + (x−1)^2/12 − (x−1)^4/720 + (x−1)^6/30240 − (x−1)^8/1209600 + ... 
for |x−1| &lt; π.</span> </li> <li id="cite_note-NB_6-71"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_6_71-0">^</a></b></span> <span class="reference-text">If <a href="/wiki/Long_double" title="Long double">long double</a> is <a href="/wiki/IEEE_quad_precision" class="mw-redirect" title="IEEE quad precision">IEEE quad precision</a> then full double precision is retained; if long double is <a href="/wiki/IEEE_double_extended_precision" class="mw-redirect" title="IEEE double extended precision">IEEE double extended precision</a> then additional, but not full precision is retained.</span> </li> <li id="cite_note-NB_7-79"><span class="mw-cite-backlink"><b><a href="#cite_ref-NB_7_79-0">^</a></b></span> <span class="reference-text">The equivalence of the two forms can be verified algebraically by noting that the <a href="/wiki/Denominator" class="mw-redirect" title="Denominator">denominator</a> of the fraction in the second form is the <a href="/wiki/Conjugate_(algebra)" class="mw-redirect" title="Conjugate (algebra)">conjugate</a> of the <a href="/wiki/Numerator" class="mw-redirect" title="Numerator">numerator</a> of the first. 
By multiplying the top and bottom of the first expression by this conjugate, one obtains the second expression.</span> </li> </ol></div></div> <div class="mw-heading mw-heading2"><h2 id="References">References</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=25" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239543626" /><div class="reflist reflist-columns references-column-width" style="column-width: 30em;"> <ol class="references"> <li id="cite_note-Muller_2010-1"><span class="mw-cite-backlink">^ <a href="#cite_ref-Muller_2010_1-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Muller_2010_1-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Muller_2010_1-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-Muller_2010_1-3"><sup><i><b>d</b></i></sup></a> <a href="#cite_ref-Muller_2010_1-4"><sup><i><b>e</b></i></sup></a> <a href="#cite_ref-Muller_2010_1-5"><sup><i><b>f</b></i></sup></a></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited.id-lock-limited a,.mw-parser-output .id-lock-registration.id-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription.id-lock-subscription a{background:url("//upload.wikimedia.org/wikipedia/commons/a/aa/Lock-red-alt-2.svg")right 0.1em center/9px 
no-repeat}.mw-parser-output .cs1-ws-icon a{background:url("//upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg")right 0.1em center/12px no-repeat}body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-free a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-limited a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-registration a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-subscription a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .cs1-ws-icon a{background-size:contain;padding:0 1em 0 0}.mw-parser-output .cs1-code{color:inherit;background:inherit;border:none;padding:inherit}.mw-parser-output .cs1-hidden-error{display:none;color:var(--color-error,#d33)}.mw-parser-output .cs1-visible-error{color:var(--color-error,#d33)}.mw-parser-output .cs1-maint{display:none;color:#085;margin-left:0.3em}.mw-parser-output .cs1-kern-left{padding-left:0.2em}.mw-parser-output .cs1-kern-right{padding-right:0.2em}.mw-parser-output .citation .mw-selflink{font-weight:inherit}@media screen{.mw-parser-output .cs1-format{font-size:95%}html.skin-theme-clientpref-night .mw-parser-output .cs1-maint{color:#18911f}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .cs1-maint{color:#18911f}}</style><cite id="muller_et_al_pg_16" class="citation book cs1">Muller, Jean-Michel; Brisebarre, Nicolas; de Dinechin, Florent; Jeannerod, Claude-Pierre; Lefèvre, Vincent; Melquiond, Guillaume; <a href="/wiki/Nathalie_Revol" title="Nathalie Revol">Revol, Nathalie</a>; Stehlé, Damien; Torres, Serge (2010). <a rel="nofollow" class="external text" href="https://books.google.com/books?id=baFvrIOPvncC&pg=PA16"><i>Handbook of Floating-Point Arithmetic</i></a> (1st ed.). <a href="/wiki/Birkh%C3%A4user" title="Birkhäuser">Birkhäuser</a>. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-0-8176-4705-6">10.1007/978-0-8176-4705-6</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-8176-4704-9" title="Special:BookSources/978-0-8176-4704-9"><bdi>978-0-8176-4704-9</bdi></a>. <a href="/wiki/LCCN_(identifier)" class="mw-redirect" title="LCCN (identifier)">LCCN</a> <a rel="nofollow" class="external text" href="https://lccn.loc.gov/2009939668">2009939668</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Handbook+of+Floating-Point+Arithmetic&rft.edition=1st&rft.pub=Birkh%C3%A4user&rft.date=2010&rft_id=info%3Alccn%2F2009939668&rft_id=info%3Adoi%2F10.1007%2F978-0-8176-4705-6&rft.isbn=978-0-8176-4704-9&rft.aulast=Muller&rft.aufirst=Jean-Michel&rft.au=Brisebarre%2C+Nicolas&rft.au=de+Dinechin%2C+Florent&rft.au=Jeannerod%2C+Claude-Pierre&rft.au=Lef%C3%A8vre%2C+Vincent&rft.au=Melquiond%2C+Guillaume&rft.au=Revol%2C+Nathalie&rft.au=Stehl%C3%A9%2C+Damien&rft.au=Torres%2C+Serge&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DbaFvrIOPvncC%26pg%3DPA16&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-sterbenz1974fpcomp-2"><span class="mw-cite-backlink">^ <a href="#cite_ref-sterbenz1974fpcomp_2-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-sterbenz1974fpcomp_2-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSterbenz1974" class="citation book cs1">Sterbenz, Pat H. (1974). <a rel="nofollow" class="external text" href="https://archive.org/details/SterbenzFloatingPointComputation/mode/2up"><i>Floating-Point Computation</i></a>. 
Englewood Cliffs, NJ, United States: Prentice-Hall. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/0-13-322495-3" title="Special:BookSources/0-13-322495-3"><bdi>0-13-322495-3</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Floating-Point+Computation&rft.place=Englewood+Cliffs%2C+NJ%2C+United+States&rft.pub=Prentice-Hall&rft.date=1974&rft.isbn=0-13-322495-3&rft.aulast=Sterbenz&rft.aufirst=Pat+H.&rft_id=https%3A%2F%2Farchive.org%2Fdetails%2FSterbenzFloatingPointComputation%2Fmode%2F2up&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Smith_1997-3"><span class="mw-cite-backlink"><b><a href="#cite_ref-Smith_1997_3-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSmith1997" class="citation book cs1">Smith, Steven W. (1997). <a rel="nofollow" class="external text" href="http://www.dspguide.com/ch28/4.htm">"Chapter 28, Fixed versus Floating Point"</a>. <i>The Scientist and Engineer's Guide to Digital Signal Processing</i>. California Technical Pub. p. 514. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-9660176-3-2" title="Special:BookSources/978-0-9660176-3-2"><bdi>978-0-9660176-3-2</bdi></a><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2012-12-31</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Chapter+28%2C+Fixed+versus+Floating+Point&rft.btitle=The+Scientist+and+Engineer%27s+Guide+to+Digital+Signal+Processing&rft.pages=514&rft.pub=California+Technical+Pub&rft.date=1997&rft.isbn=978-0-9660176-3-2&rft.aulast=Smith&rft.aufirst=Steven+W.&rft_id=http%3A%2F%2Fwww.dspguide.com%2Fch28%2F4.htm&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Zehendner_2008-6"><span class="mw-cite-backlink">^ <a href="#cite_ref-Zehendner_2008_6-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Zehendner_2008_6-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFZehendner2008" class="citation web cs1 cs1-prop-foreign-lang-source">Zehendner, Eberhard (Summer 2008). <a rel="nofollow" class="external text" href="https://users.fmi.uni-jena.de/~nez/rechnerarithmetik_5/folien/Rechnerarithmetik.2008.05.handout.pdf">"Rechnerarithmetik: Fest- und Gleitkommasysteme"</a> <span class="cs1-format">(PDF)</span> (Lecture script) (in German). <a href="/wiki/Friedrich-Schiller-Universit%C3%A4t_Jena" class="mw-redirect" title="Friedrich-Schiller-Universität Jena">Friedrich-Schiller-Universität Jena</a>. p. 2. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20180807062449/https://users.fmi.uni-jena.de/~nez/rechnerarithmetik_5/folien/Rechnerarithmetik.2008.05.handout.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2018-08-07<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2018-08-07</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Rechnerarithmetik%3A+Fest-+und+Gleitkommasysteme&rft.pages=2&rft.pub=Friedrich-Schiller-Universit%C3%A4t+Jena&rft.date=2008&rft.aulast=Zehendner&rft.aufirst=Eberhard&rft_id=https%3A%2F%2Fusers.fmi.uni-jena.de%2F~nez%2Frechnerarithmetik_5%2Ffolien%2FRechnerarithmetik.2008.05.handout.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> <a rel="nofollow" class="external autonumber" href="https://web.archive.org/web/20180806175620/https://users.fmi.uni-jena.de/~nez/rechnerarithmetik_5/folien/Rechnerarithmetik.2008.komplett.pdf">[1]</a> (NB. This reference incorrectly gives the MANIAC II's floating point base as 256, whereas it actually is 65536.)</span> </li> <li id="cite_note-Beebe_2017-7"><span class="mw-cite-backlink">^ <a href="#cite_ref-Beebe_2017_7-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Beebe_2017_7-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Beebe_2017_7-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-Beebe_2017_7-3"><sup><i><b>d</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFBeebe2017" class="citation book cs1">Beebe, Nelson H. F. (2017-08-22). "Chapter H. Historical floating-point architectures". <i>The Mathematical-Function Computation Handbook - Programming Using the MathCW Portable Software Library</i> (1st ed.). Salt Lake City, UT, USA: <a href="/wiki/Springer_International_Publishing_AG" class="mw-redirect" title="Springer International Publishing AG">Springer International Publishing AG</a>. p. 948. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-3-319-64110-2">10.1007/978-3-319-64110-2</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-319-64109-6" title="Special:BookSources/978-3-319-64109-6"><bdi>978-3-319-64109-6</bdi></a>. <a href="/wiki/LCCN_(identifier)" class="mw-redirect" title="LCCN (identifier)">LCCN</a> <a rel="nofollow" class="external text" href="https://lccn.loc.gov/2017947446">2017947446</a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:30244721">30244721</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Chapter+H.+Historical+floating-point+architectures&rft.btitle=The+Mathematical-Function+Computation+Handbook+-+Programming+Using+the+MathCW+Portable+Software+Library&rft.place=Salt+Lake+City%2C+UT%2C+USA&rft.pages=948&rft.edition=1st&rft.pub=Springer+International+Publishing+AG&rft.date=2017-08-22&rft_id=info%3Adoi%2F10.1007%2F978-3-319-64110-2&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A30244721%23id-name%3DS2CID&rft_id=info%3Alccn%2F2017947446&rft.isbn=978-3-319-64109-6&rft.aulast=Beebe&rft.aufirst=Nelson+H.+F.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Savard_2018-9"><span class="mw-cite-backlink"><b><a href="#cite_ref-Savard_2018_9-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSavard2018" class="citation cs2">Savard, John J. G. 
(2018) [2007], <a rel="nofollow" class="external text" href="http://www.quadibloc.com/comp/cp020302.htm">"The Decimal Floating-Point Standard"</a>, <i>quadibloc</i>, <a rel="nofollow" class="external text" href="https://web.archive.org/web/20180703002322/http://www.quadibloc.com/comp/cp020302.htm">archived</a> from the original on 2018-07-03<span class="reference-accessdate">, retrieved <span class="nowrap">2018-07-16</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=quadibloc&rft.atitle=The+Decimal+Floating-Point+Standard&rft.date=2018&rft.aulast=Savard&rft.aufirst=John+J.+G.&rft_id=http%3A%2F%2Fwww.quadibloc.com%2Fcomp%2Fcp020302.htm&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Parkinson_2000-11"><span class="mw-cite-backlink"><b><a href="#cite_ref-Parkinson_2000_11-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFParkinson2000" class="citation book cs1">Parkinson, Roger (2000-12-07). <a rel="nofollow" class="external text" href="https://books.google.com/books?id=Ocip5vpLD4wC&pg=PA24">"Chapter 2 - High resolution digital site survey systems - Chapter 2.1 - Digital field recording systems"</a>. <i>High Resolution Site Surveys</i> (1st ed.). <a href="/wiki/CRC_Press" title="CRC Press">CRC Press</a>. p. 24. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-20318604-6" title="Special:BookSources/978-0-20318604-6"><bdi>978-0-20318604-6</bdi></a><span class="reference-accessdate">. Retrieved <span class="nowrap">2019-08-18</span></span>. <q>[…] Systems such as the [Digital Field System] DFS IV and DFS V were quaternary floating-point systems and used gain steps of 12 dB. 
[…]</q></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Chapter+2+-+High+resolution+digital+site+survey+systems+-+Chapter+2.1+-+Digital+field+recording+systems&rft.btitle=High+Resolution+Site+Surveys&rft.pages=24&rft.edition=1st&rft.pub=CRC+Press&rft.date=2000-12-07&rft.isbn=978-0-20318604-6&rft.aulast=Parkinson&rft.aufirst=Roger&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DOcip5vpLD4wC%26pg%3DPA24&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (256 pages)</span> </li> <li id="cite_note-Lazarus_1956-14"><span class="mw-cite-backlink"><b><a href="#cite_ref-Lazarus_1956_14-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFLazarus1957" class="citation web cs1">Lazarus, Roger B. (1957-01-30) [1956-10-01]. <a rel="nofollow" class="external text" href="http://bitsavers.org/pdf/lanl/LA-2083_MANIAC_II_Oct56.pdf">"MANIAC II"</a> <span class="cs1-format">(PDF)</span>. Los Alamos, NM, USA: Los Alamos Scientific Laboratory of the University of California. p. 14. LA-2083. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20180807200914/http://bitsavers.org/pdf/lanl/LA-2083_MANIAC_II_Oct56.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2018-08-07<span class="reference-accessdate">. Retrieved <span class="nowrap">2018-08-07</span></span>. <q>[…] the Maniac's floating base, which is 2<sup>16</sup> = 65,536. […] The Maniac's large base permits a considerable increase in the speed of floating point arithmetic. Although such a large base implies the possibility of as many as 15 lead zeros, the large word size of 48 bits guarantees adequate significance. 
[…]</q></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=MANIAC+II&rft.place=Los+Alamos%2C+NM%2C+USA&rft.pages=14&rft.pub=Los+Alamos+Scientific+Laboratory+of+the+University+of+California&rft.date=1957-01-30&rft.aulast=Lazarus&rft.aufirst=Roger+B.&rft_id=http%3A%2F%2Fbitsavers.org%2Fpdf%2Flanl%2FLA-2083_MANIAC_II_Oct56.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-16"><span class="mw-cite-backlink"><b><a href="#cite_ref-16">^</a></b></span> <span class="reference-text">Torres Quevedo, Leonardo. <a rel="nofollow" class="external text" href="https://quickclick.es/rop/pdf/publico/1914/1914_tomoI_2043_01.pdf">Automática: Complemento de la Teoría de las Máquinas, (pdf)</a>, pp. 575–583, Revista de Obras Públicas, 19 November 1914.</span> </li> <li id="cite_note-17"><span class="mw-cite-backlink"><b><a href="#cite_ref-17">^</a></b></span> <span class="reference-text">Ronald T. Kneusel. <i><a rel="nofollow" class="external text" href="https://books.google.com/books?id=eq4ZDgAAQBAJ&dq=leonardo+torres+quevedo++electromechanical+machine+essays&pg=PA84">Numbers and Computers</a>,</i> Springer, pp. 84–85, 2017. <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3319505084" title="Special:BookSources/978-3319505084">978-3319505084</a></span> </li> <li id="cite_note-FOOTNOTERandell19826,_11–13-18"><span class="mw-cite-backlink"><b><a href="#cite_ref-FOOTNOTERandell19826,_11–13_18-0">^</a></b></span> <span class="reference-text"><a href="#CITEREFRandell1982">Randell 1982</a>, pp. 6, 11–13.</span> </li> <li id="cite_note-19"><span class="mw-cite-backlink"><b><a href="#cite_ref-19">^</a></b></span> <span class="reference-text">Randell, Brian. 
<a rel="nofollow" class="external text" href="https://dl.acm.org/doi/pdf/10.5555/1074100.1074334">Digital Computers, History of Origins, (pdf)</a>, p. 545, Digital Computers: Origins, Encyclopedia of Computer Science, January 2003.</span> </li> <li id="cite_note-Rojas_1997-20"><span class="mw-cite-backlink"><b><a href="#cite_ref-Rojas_1997_20-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFRojas1997" class="citation journal cs1"><a href="/wiki/Ra%C3%BAl_Rojas" title="Raúl Rojas">Rojas, Raúl</a> (April–June 1997). <a rel="nofollow" class="external text" href="http://ed-thelen.org/comp-hist/Zuse_Z1_and_Z3.pdf">"Konrad Zuse's Legacy: The Architecture of the Z1 and Z3"</a> <span class="cs1-format">(PDF)</span>. <i><a href="/wiki/IEEE_Annals_of_the_History_of_Computing" title="IEEE Annals of the History of Computing">IEEE Annals of the History of Computing</a></i>. <b>19</b> (2): <span class="nowrap">5–</span>16. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2F85.586067">10.1109/85.586067</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20220703082408/http://ed-thelen.org/comp-hist/Zuse_Z1_and_Z3.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2022-07-03<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2022-07-03</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=IEEE+Annals+of+the+History+of+Computing&rft.atitle=Konrad+Zuse%27s+Legacy%3A+The+Architecture+of+the+Z1+and+Z3&rft.volume=19&rft.issue=2&rft.pages=5-16&rft.date=1997-04%2F1997-06&rft_id=info%3Adoi%2F10.1109%2F85.586067&rft.aulast=Rojas&rft.aufirst=Ra%C3%BAl&rft_id=http%3A%2F%2Fed-thelen.org%2Fcomp-hist%2FZuse_Z1_and_Z3.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (12 pages)</span> </li> <li id="cite_note-Rojas_2014-21"><span class="mw-cite-backlink"><b><a href="#cite_ref-Rojas_2014_21-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFRojas2014" class="citation arxiv cs1"><a href="/wiki/Ra%C3%BAl_Rojas" title="Raúl Rojas">Rojas, Raúl</a> (2014-06-07). "The Z1: Architecture and Algorithms of Konrad Zuse's First Computer". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/1406.1886">1406.1886</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.AR">cs.AR</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=The+Z1%3A+Architecture+and+Algorithms+of+Konrad+Zuse%27s+First+Computer&rft.date=2014-06-07&rft_id=info%3Aarxiv%2F1406.1886&rft.aulast=Rojas&rft.aufirst=Ra%C3%BAl&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kahan_1997_JVNL-22"><span class="mw-cite-backlink">^ <a href="#cite_ref-Kahan_1997_JVNL_22-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Kahan_1997_JVNL_22-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahan1997" class="citation web cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a> (1997-07-15). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/SIAMjvnl.pdf">"The Baleful Effect of Computer Languages and Benchmarks upon Applied Mathematics, Physics and Chemistry. John von Neumann Lecture"</a> <span class="cs1-format">(PDF)</span>. p. 3. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20080905103125/http://www.cs.berkeley.edu/~wkahan/SIAMjvnl.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2008-09-05.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=The+Baleful+Effect+of+Computer+Languages+and+Benchmarks+upon+Applied+Mathematics%2C+Physics+and+Chemistry.+John+von+Neumann+Lecture&rft.pages=3&rft.date=1997-07-15&rft.aulast=Kahan&rft.aufirst=William+Morton&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2FSIAMjvnl.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Randell_1982_2-23"><span class="mw-cite-backlink"><b><a href="#cite_ref-Randell_1982_2_23-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFRandell1982" class="citation book cs1"><a href="/wiki/Brian_Randell" title="Brian Randell">Randell, Brian</a>, ed. (1982) [1973]. <i>The Origins of Digital Computers: Selected Papers</i> (3rd ed.). Berlin; New York: <a href="/wiki/Springer-Verlag" class="mw-redirect" title="Springer-Verlag">Springer-Verlag</a>. p. 244. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-540-11319-5" title="Special:BookSources/978-3-540-11319-5"><bdi>978-3-540-11319-5</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=The+Origins+of+Digital+Computers%3A+Selected+Papers&rft.place=Berlin%3B+New+York&rft.pages=244&rft.edition=3rd&rft.pub=Springer-Verlag&rft.date=1982&rft.isbn=978-3-540-11319-5&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Severance_1998-24"><span class="mw-cite-backlink"><b><a href="#cite_ref-Severance_1998_24-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSeverance1998" class="citation web cs1"><a href="/wiki/Charles_Severance_(computer_scientist)" title="Charles Severance (computer scientist)">Severance, Charles</a> (1998-02-20). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/ieee754status/754story.html">"An Interview with the Old Man of Floating-Point"</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=An+Interview+with+the+Old+Man+of+Floating-Point&rft.date=1998-02-20&rft.aulast=Severance&rft.aufirst=Charles&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2Fieee754status%2F754story.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-C99-25"><span class="mw-cite-backlink"><b><a href="#cite_ref-C99_25-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation book cs1"><i>ISO/IEC 9899:1999 - Programming languages - C</i>. Iso.org. §F.2, note 307. 
<q><span class="cs1-kern-left"></span>"Extended" is IEC 60559's double-extended data format. Extended refers to both the common 80-bit and quadruple 128-bit IEC 60559 formats.</q></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=ISO%2FIEC+9899%3A1999+-+Programming+languages+-+C&rft.pages=%C2%A7F.2%2C+note+307&rft.pub=Iso.org&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-MSVC-26"><span class="mw-cite-backlink"><b><a href="#cite_ref-MSVC_26-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://learn.microsoft.com/en-us/cpp/build/ieee-floating-point-representation">"IEEE Floating-Point Representation"</a>. 2021-08-03.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=IEEE+Floating-Point+Representation&rft.date=2021-08-03&rft_id=https%3A%2F%2Flearn.microsoft.com%2Fen-us%2Fcpp%2Fbuild%2Fieee-floating-point-representation&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-GCC-27"><span class="mw-cite-backlink"><b><a href="#cite_ref-GCC_27-0">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://gcc.gnu.org/onlinedocs/gcc/i386-and-x86-64-Options.html">Using the GNU Compiler Collection, i386 and x86-64 Options</a> <a rel="nofollow" class="external text" href="https://web.archive.org/web/20150116065447/http://gcc.gnu.org/onlinedocs/gcc/i386-and-x86-64-Options.html">Archived</a> 2015-01-16 at the <a href="/wiki/Wayback_Machine" title="Wayback Machine">Wayback Machine</a>.</span> </li> <li id="cite_note-float_128-28"><span class="mw-cite-backlink"><b><a href="#cite_ref-float_128_28-0">^</a></b></span> 
<span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://stackoverflow.com/questions/13516476">"long double (GCC specific) and __float128"</a>. <i>StackOverflow</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=StackOverflow&rft.atitle=long+double+%28GCC+specific%29+and+__float128&rft_id=https%3A%2F%2Fstackoverflow.com%2Fquestions%2F13516476&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-ARM_2013_AArch64-29"><span class="mw-cite-backlink"><b><a href="#cite_ref-ARM_2013_AArch64_29-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf">"Procedure Call Standard for the ARM 64-bit Architecture (AArch64)"</a> <span class="cs1-format">(PDF)</span>. 2013-05-22. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20130731181404/http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2013-07-31<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2019-09-22</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Procedure+Call+Standard+for+the+ARM+64-bit+Architecture+%28AArch64%29&rft.date=2013-05-22&rft_id=http%3A%2F%2Finfocenter.arm.com%2Fhelp%2Ftopic%2Fcom.arm.doc.ihi0055b%2FIHI0055B_aapcs64.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-ARM_2013_Compiler-30"><span class="mw-cite-backlink"><b><a href="#cite_ref-ARM_2013_Compiler_30-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://infocenter.arm.com/help/topic/com.arm.doc.dui0491i/DUI0491I_arm_compiler_reference.pdf">"ARM Compiler toolchain Compiler Reference, Version 5.03"</a> <span class="cs1-format">(PDF)</span>. 2013. Section 6.3 <i>Basic data types</i>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20150627210618/http://infocenter.arm.com/help/topic/com.arm.doc.dui0491i/DUI0491I_arm_compiler_reference.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2015-06-27<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2019-11-08</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=ARM+Compiler+toolchain+Compiler+Reference%2C+Version+5.03&rft.pages=Section+6.3+Basic+data+types&rft.date=2013&rft_id=http%3A%2F%2Finfocenter.arm.com%2Fhelp%2Ftopic%2Fcom.arm.doc.dui0491i%2FDUI0491I_arm_compiler_reference.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kahan_2004-31"><span class="mw-cite-backlink"><b><a href="#cite_ref-Kahan_2004_31-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahan2004" class="citation web cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a> (2004-11-20). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/Qdrtcs.pdf">"On the Cost of Floating-Point Computation Without Extra-Precise Arithmetic"</a> <span class="cs1-format">(PDF)</span>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20060525111157/http://www.cs.berkeley.edu/~wkahan/Qdrtcs.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2006-05-25<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2012-02-19</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=On+the+Cost+of+Floating-Point+Computation+Without+Extra-Precise+Arithmetic&rft.date=2004-11-20&rft.aulast=Kahan&rft.aufirst=William+Morton&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2FQdrtcs.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-OpenEXR-32"><span class="mw-cite-backlink"><b><a href="#cite_ref-OpenEXR_32-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://web.archive.org/web/20130508221152/http://www.openexr.com/about.html">"openEXR"</a>. openEXR. Archived from <a rel="nofollow" class="external text" href="http://www.openexr.com/about.html">the original</a> on 2013-05-08<span class="reference-accessdate">. Retrieved <span class="nowrap">2012-04-25</span></span>. <q>Since the IEEE-754 floating-point specification does not define a 16-bit format, ILM created the "half" format. 
Half values have 1 sign bit, 5 exponent bits, and 10 mantissa bits.</q></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=openEXR&rft.pub=openEXR&rft_id=http%3A%2F%2Fwww.openexr.com%2Fabout.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-OpenEXR-half-33"><span class="mw-cite-backlink"><b><a href="#cite_ref-OpenEXR-half_33-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://openexr.com/en/latest/TechnicalIntroduction.html#the-half-data-type">"Technical Introduction to OpenEXR – The half Data Type"</a>. openEXR<span class="reference-accessdate">. Retrieved <span class="nowrap">2024-04-16</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Technical+Introduction+to+OpenEXR+%E2%80%93+The+half+Data+Type&rft.pub=openEXR&rft_id=https%3A%2F%2Fopenexr.com%2Fen%2Flatest%2FTechnicalIntroduction.html%23the-half-data-type&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-IEEE-754_Analysis-34"><span class="mw-cite-backlink"><b><a href="#cite_ref-IEEE-754_Analysis_34-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://christophervickery.com/IEEE-754/">"IEEE-754 Analysis"</a><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2024-08-29</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=IEEE-754+Analysis&rft_id=https%3A%2F%2Fchristophervickery.com%2FIEEE-754%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Borland_1994_MBF-35"><span class="mw-cite-backlink">^ <a href="#cite_ref-Borland_1994_MBF_35-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Borland_1994_MBF_35-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFBorland_staff1998" class="citation web cs1">Borland staff (1998-07-02) [1994-03-10]. <a rel="nofollow" class="external text" href="https://community.embarcadero.com/index.php/article/technical-articles/162-programming/14799-converting-between-microsoft-binary-and-ieee-forma">"Converting between Microsoft Binary and IEEE formats"</a>. <i>Technical Information Database</i> (TI1431C.txt). <a href="/wiki/Embarcadero_USA" class="mw-redirect" title="Embarcadero USA">Embarcadero USA</a> / <a href="/wiki/Inprise" class="mw-redirect" title="Inprise">Inprise</a> (originally: <a href="/wiki/Borland" title="Borland">Borland</a>). ID 1400. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20190220230417/https://community.embarcadero.com/index.php/article/technical-articles/162-programming/14799-converting-between-microsoft-binary-and-ieee-forma">Archived</a> from the original on 2019-02-20<span class="reference-accessdate">. Retrieved <span class="nowrap">2016-05-30</span></span>. 
<q>[…] _fmsbintoieee(float *src4, float *dest4) […] MS Binary Format […] byte order => m3 | m2 | m1 | exponent […] m1 is <a href="/wiki/Most_significant_byte" class="mw-redirect" title="Most significant byte">most significant byte</a> => sbbb|bbbb […] m3 is the <a href="/wiki/Least_significant_byte" class="mw-redirect" title="Least significant byte">least significant byte</a> […] m = mantissa byte […] s = sign bit […] b = bit […] MBF is bias 128 and IEEE is bias 127. […] MBF places the <a href="/wiki/Decimal_point" class="mw-redirect" title="Decimal point">decimal point</a> before the <a href="/wiki/Assumed_bit" class="mw-redirect" title="Assumed bit">assumed bit</a>, while IEEE places the decimal point after the assumed bit. […] ieee_exp = msbin[3] - 2; /* actually, msbin[3]-1-128+127 */ […] _dmsbintoieee(double *src8, double *dest8) […] MS Binary Format […] byte order => m7 | m6 | m5 | m4 | m3 | m2 | m1 | exponent […] m1 is most significant byte => smmm|mmmm […] m7 is the least significant byte […] MBF is bias 128 and IEEE is bias 1023. […] MBF places the decimal point before the assumed bit, while IEEE places the decimal point after the assumed bit. 
[…] ieee_exp = msbin[7] - 128 - 1 + 1023; […]</q></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Technical+Information+Database&rft.atitle=Converting+between+Microsoft+Binary+and+IEEE+formats&rft.date=1998-07-02&rft.au=Borland+staff&rft_id=https%3A%2F%2Fcommunity.embarcadero.com%2Findex.php%2Farticle%2Ftechnical-articles%2F162-programming%2F14799-converting-between-microsoft-binary-and-ieee-forma&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Steil_2008_6502-36"><span class="mw-cite-backlink">^ <a href="#cite_ref-Steil_2008_6502_36-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Steil_2008_6502_36-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSteil2008" class="citation web cs1">Steil, Michael (2008-10-20). <a rel="nofollow" class="external text" href="http://www.pagetable.com/?p=46">"Create your own Version of Microsoft BASIC for 6502"</a>. pagetable.com. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20160530092603/http://www.pagetable.com/?p=46">Archived</a> from the original on 2016-05-30<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2016-05-30</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Create+your+own+Version+of+Microsoft+BASIC+for+6502&rft.pub=pagetable.com&rft.date=2008-10-20&rft.aulast=Steil&rft.aufirst=Michael&rft_id=http%3A%2F%2Fwww.pagetable.com%2F%3Fp%3D46&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Microsoft_2006_KB35826-37"><span class="mw-cite-backlink"><b><a href="#cite_ref-Microsoft_2006_KB35826_37-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://www.betaarchive.com/wiki/index.php/Microsoft_KB_Archive/35826#IEEE_vs._Microsoft_Binary_Format.3B_Rounding_Issues_.28Complete.29">"IEEE vs. Microsoft Binary Format; Rounding Issues (Complete)"</a>. <i>Microsoft Support</i>. <a href="/wiki/Microsoft" title="Microsoft">Microsoft</a>. 2006-11-21. Article ID KB35826, Q35826. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20200828130651/https://www.betaarchive.com/wiki/index.php/Microsoft_KB_Archive/35826">Archived</a> from the original on 2020-08-28<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2010-02-24</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=Microsoft+Support&rft.atitle=IEEE+vs.+Microsoft+Binary+Format%3B+Rounding+Issues+%28Complete%29&rft.date=2006-11-21&rft_id=https%3A%2F%2Fwww.betaarchive.com%2Fwiki%2Findex.php%2FMicrosoft_KB_Archive%2F35826%23IEEE_vs._Microsoft_Binary_Format.3B_Rounding_Issues_.28Complete.29&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kharya_2020-38"><span class="mw-cite-backlink">^ <a href="#cite_ref-Kharya_2020_38-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Kharya_2020_38-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKharya2020" class="citation web cs1">Kharya, Paresh (2020-05-14). <a rel="nofollow" class="external text" href="https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/">"TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x"</a><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2020-05-16</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=TensorFloat-32+in+the+A100+GPU+Accelerates+AI+Training%2C+HPC+up+to+20x&rft.date=2020-05-14&rft.aulast=Kharya&rft.aufirst=Paresh&rft_id=https%3A%2F%2Fblogs.nvidia.com%2Fblog%2F2020%2F05%2F14%2Ftensorfloat-32-precision-format%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-NVIDIA_Hopper-39"><span class="mw-cite-backlink"><b><a href="#cite_ref-NVIDIA_Hopper_39-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://developer.nvidia.com/blog/nvidia-hopper-architecture-in-depth/">"NVIDIA Hopper Architecture In-Depth"</a>. 2022-03-22.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=NVIDIA+Hopper+Architecture+In-Depth&rft.date=2022-03-22&rft_id=https%3A%2F%2Fdeveloper.nvidia.com%2Fblog%2Fnvidia-hopper-architecture-in-depth%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Micikevicius_2022-40"><span class="mw-cite-backlink"><b><a href="#cite_ref-Micikevicius_2022_40-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFMicikeviciusStosicBurgessCornea2022" class="citation arxiv cs1">Micikevicius, Paulius; Stosic, Dusan; Burgess, Neil; Cornea, Marius; Dubey, Pradeep; Grisenthwaite, Richard; Ha, Sangwon; Heinecke, Alexander; Judd, Patrick; Kamalu, John; Mellempudi, Naveen; Oberman, Stuart; Shoeybi, Mohammad; Siu, Michael; Wu, Hao (2022-09-12). "FP8 Formats for Deep Learning". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2209.05433">2209.05433</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.LG">cs.LG</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=FP8+Formats+for+Deep+Learning&rft.date=2022-09-12&rft_id=info%3Aarxiv%2F2209.05433&rft.aulast=Micikevicius&rft.aufirst=Paulius&rft.au=Stosic%2C+Dusan&rft.au=Burgess%2C+Neil&rft.au=Cornea%2C+Marius&rft.au=Dubey%2C+Pradeep&rft.au=Grisenthwaite%2C+Richard&rft.au=Ha%2C+Sangwon&rft.au=Heinecke%2C+Alexander&rft.au=Judd%2C+Patrick&rft.au=Kamalu%2C+John&rft.au=Mellempudi%2C+Naveen&rft.au=Oberman%2C+Stuart&rft.au=Shoeybi%2C+Mohammad&rft.au=Siu%2C+Michael&rft.au=Wu%2C+Hao&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kahan_2006_Mindless-42"><span class="mw-cite-backlink"><b><a href="#cite_ref-Kahan_2006_Mindless_42-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahan2006" class="citation web cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a> (2006-01-11). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/Mindless.pdf">"How Futile are Mindless Assessments of Roundoff in Floating-Point Computation?"</a> <span class="cs1-format">(PDF)</span>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20041221020332/http://www.cs.berkeley.edu/~wkahan/Mindless.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2004-12-21.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=How+Futile+are+Mindless+Assessments+of+Roundoff+in+Floating-Point+Computation%3F&rft.date=2006-01-11&rft.aulast=Kahan&rft.aufirst=William+Morton&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2FMindless.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Gay_1990-43"><span class="mw-cite-backlink">^ <a href="#cite_ref-Gay_1990_43-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Gay_1990_43-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFGay1990" class="citation techreport cs1">Gay, David M. (1990). <i>Correctly Rounded Binary-Decimal and Decimal-Binary Conversions</i> (Technical report). NUMERICAL ANALYSIS MANUSCRIPT 90-10, AT&T BELL LABORATORIES. 
<a href="/wiki/CiteSeerX_(identifier)" class="mw-redirect" title="CiteSeerX (identifier)">CiteSeerX</a> <span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.31.4049">10.1.1.31.4049</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=report&rft.btitle=Correctly+Rounded+Binary-Decimal+and+Decimal-Binary+Conversions&rft.pub=NUMERICAL+ANALYSIS+MANUSCRIPT+90-10%2C+AT%26T+BELL+LABORATORIES&rft.date=1990&rft_id=https%3A%2F%2Fciteseerx.ist.psu.edu%2Fviewdoc%2Fsummary%3Fdoi%3D10.1.1.31.4049%23id-name%3DCiteSeerX&rft.aulast=Gay&rft.aufirst=David+M.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (<a rel="nofollow" class="external text" href="http://www.netlib.org/fp/dtoa.c">dtoa.c in netlib</a>)</span> </li> <li id="cite_note-Loitsch_2010-44"><span class="mw-cite-backlink"><b><a href="#cite_ref-Loitsch_2010_44-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFLoitsch2010" class="citation conference cs1">Loitsch, Florian (2010). <a rel="nofollow" class="external text" href="https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf">"Printing floating-point numbers quickly and accurately with integers"</a> <span class="cs1-format">(PDF)</span>. <i>Proceedings of the 31st ACM SIGPLAN Conference on Programming Language Design and Implementation</i>. PLDI '10: ACM SIGPLAN Conference on Programming Language Design and Implementation. pp. <span class="nowrap">233–</span>243. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F1806596.1806623">10.1145/1806596.1806623</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-45030019-3" title="Special:BookSources/978-1-45030019-3"><bdi>978-1-45030019-3</bdi></a>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:910409">910409</a>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20140729005717/http://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2014-07-29.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.atitle=Printing+floating-point+numbers+quickly+and+accurately+with+integers&rft.btitle=Proceedings+of+the+31st+ACM+SIGPLAN+Conference+on+Programming+Language+Design+and+Implementation&rft.pages=%3Cspan+class%3D%22nowrap%22%3E233-%3C%2Fspan%3E243&rft.date=2010&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A910409%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1145%2F1806596.1806623&rft.isbn=978-1-45030019-3&rft.aulast=Loitsch&rft.aufirst=Florian&rft_id=https%3A%2F%2Fwww.cs.tufts.edu%2F~nr%2Fcs257%2Farchive%2Fflorian-loitsch%2Fprintf.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-mazong-45"><span class="mw-cite-backlink"><b><a href="#cite_ref-mazong_45-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/dotnet/coreclr/pull/14646">"Added Grisu3 algorithm support for double.ToString(). by mazong1123 · Pull Request #14646 · dotnet/coreclr"</a>. 
<i>GitHub</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=Added+Grisu3+algorithm+support+for+double.ToString%28%29.+by+mazong1123+%C2%B7+Pull+Request+%2314646+%C2%B7+dotnet%2Fcoreclr&rft_id=https%3A%2F%2Fgithub.com%2Fdotnet%2Fcoreclr%2Fpull%2F14646&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Adams_2018-46"><span class="mw-cite-backlink"><b><a href="#cite_ref-Adams_2018_46-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFAdams2018" class="citation journal cs1">Adams, Ulf (2018-12-02). <a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F3296979.3192369">"Ryū: fast float-to-string conversion"</a>. <i>ACM SIGPLAN Notices</i>. <b>53</b> (4): <span class="nowrap">270–</span>282. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F3296979.3192369">10.1145/3296979.3192369</a></span>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:218472153">218472153</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ACM+SIGPLAN+Notices&rft.atitle=Ry%C5%AB%3A+fast+float-to-string+conversion&rft.volume=53&rft.issue=4&rft.pages=%3Cspan+class%3D%22nowrap%22%3E270-%3C%2Fspan%3E282&rft.date=2018-12-02&rft_id=info%3Adoi%2F10.1145%2F3296979.3192369&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A218472153%23id-name%3DS2CID&rft.aulast=Adams&rft.aufirst=Ulf&rft_id=https%3A%2F%2Fdoi.org%2F10.1145%252F3296979.3192369&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Giulietti-47"><span class="mw-cite-backlink"><b><a href="#cite_ref-Giulietti_47-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFGiulietti" class="citation web cs1">Giulietti, Raffaello. 
<a rel="nofollow" class="external text" href="https://drive.google.com/file/d/1IEeATSVnEE6TkrHlCYNY2GjaraBjOT4f">"The Schubfach way to render doubles"</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=The+Schubfach+way+to+render+doubles&rft.aulast=Giulietti&rft.aufirst=Raffaello&rft_id=https%3A%2F%2Fdrive.google.com%2Ffile%2Fd%2F1IEeATSVnEE6TkrHlCYNY2GjaraBjOT4f&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-abolz-48"><span class="mw-cite-backlink"><b><a href="#cite_ref-abolz_48-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/abolz/Drachennest">"abolz/Drachennest"</a>. <i><a href="/wiki/GitHub" title="GitHub">GitHub</a></i>. 2022-11-10.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=abolz%2FDrachennest&rft.date=2022-11-10&rft_id=https%3A%2F%2Fgithub.com%2Fabolz%2FDrachennest&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-double_conversion_2020-49"><span class="mw-cite-backlink"><b><a href="#cite_ref-double_conversion_2020_49-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/google/double-conversion">"google/double-conversion"</a>. <i><a href="/wiki/GitHub" title="GitHub">GitHub</a></i>. 
2020-09-21.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=google%2Fdouble-conversion&rft.date=2020-09-21&rft_id=https%3A%2F%2Fgithub.com%2Fgoogle%2Fdouble-conversion&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Lemire_2021-50"><span class="mw-cite-backlink"><b><a href="#cite_ref-Lemire_2021_50-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFLemire2021" class="citation journal cs1">Lemire, Daniel (2021-03-22). "Number parsing at a gigabyte per second". <i>Software: Practice and Experience</i>. <b>51</b> (8): <span class="nowrap">1700–</span>1727. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2101.11408">2101.11408</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1002%2Fspe.2984">10.1002/spe.2984</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:231718830">231718830</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Software%3A+Practice+and+Experience&rft.atitle=Number+parsing+at+a+gigabyte+per+second&rft.volume=51&rft.issue=8&rft.pages=%3Cspan+class%3D%22nowrap%22%3E1700-%3C%2Fspan%3E1727&rft.date=2021-03-22&rft_id=info%3Aarxiv%2F2101.11408&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A231718830%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1002%2Fspe.2984&rft.aulast=Lemire&rft.aufirst=Daniel&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Goldberg_1991-51"><span class="mw-cite-backlink">^ <a href="#cite_ref-Goldberg_1991_51-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Goldberg_1991_51-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Goldberg_1991_51-2"><sup><i><b>c</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFGoldberg1991" class="citation journal cs1"><a href="/w/index.php?title=David_Goldberg_(PARC)&action=edit&redlink=1" class="new" title="David Goldberg (PARC) (page does not exist)">Goldberg, David</a> (March 1991). <a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F103162.103163">"What Every Computer Scientist Should Know About Floating-Point Arithmetic"</a>. <i><a href="/wiki/ACM_Computing_Surveys" title="ACM Computing Surveys">ACM Computing Surveys</a></i>. <b>23</b> (1): <span class="nowrap">5–</span>48. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F103162.103163">10.1145/103162.103163</a></span>. <a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:222008826">222008826</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ACM+Computing+Surveys&rft.atitle=What+Every+Computer+Scientist+Should+Know+About+Floating-Point+Arithmetic&rft.volume=23&rft.issue=1&rft.pages=%3Cspan+class%3D%22nowrap%22%3E5-%3C%2Fspan%3E48&rft.date=1991-03&rft_id=info%3Adoi%2F10.1145%2F103162.103163&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A222008826%23id-name%3DS2CID&rft.aulast=Goldberg&rft.aufirst=David&rft_id=https%3A%2F%2Fdoi.org%2F10.1145%252F103162.103163&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (With the addendum "Differences Among IEEE 754 Implementations": <a rel="nofollow" class="external autonumber" href="https://web.archive.org/web/20171011072644/http://www.cse.msu.edu/~cse320/Documents/FloatingPoint.pdf">[2]</a>, <a rel="nofollow" class="external autonumber" href="https://docs.oracle.com/cd/E19957-01/806-3568/ncg_goldberg.html">[3]</a>)</span> </li> <li id="cite_note-Patterson-Hennessy_2014-52"><span class="mw-cite-backlink"><b><a href="#cite_ref-Patterson-Hennessy_2014_52-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFPattersonHennessy2014" class="citation book cs1">Patterson, David A.; Hennessy, John L. (2014). <i>Computer Organization and Design, The Hardware/Software Interface</i>. The Morgan Kaufmann series in computer architecture and design (5th ed.). 
Waltham, Massachusetts, USA: Elsevier. p. 793. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-9-86605267-5" title="Special:BookSources/978-9-86605267-5"><bdi>978-9-86605267-5</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Computer+Organization+and+Design%2C+The+Hardware%2FSoftware+Interface&rft.place=Waltham%2C+Massachusetts%2C+USA&rft.series=The+Morgan+Kaufmann+series+in+computer+architecture+and+design&rft.pages=793&rft.edition=5th&rft.pub=Elsevier&rft.date=2014&rft.isbn=978-9-86605267-5&rft.aulast=Patterson&rft.aufirst=David+A.&rft.au=Hennessy%2C+John+L.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Sierra_1962-53"><span class="mw-cite-backlink">^ <a href="#cite_ref-Sierra_1962_53-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Sierra_1962_53-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1041539562">.mw-parser-output .citation{word-wrap:break-word}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}</style><span class="citation patent" id="harv"><a rel="nofollow" class="external text" href="https://worldwide.espacenet.com/textdoc?DB=EPODOC&IDX=US3037701A">US patent 3037701A</a>, Huberto M Sierra, "Floating decimal point arithmetic control means for calculator", issued 1962-06-05</span><span class="Z3988" title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Apatent&rft.number=3037701A&rft.cc=US&rft.title=Floating+decimal+point+arithmetic+control+means+for+calculator&rft.inventor=Huberto+M+Sierra&rft.date=1962-06-05"><span style="display: none;"> </span></span></span> </li> <li id="cite_note-Kahan_1997_Status-55"><span class="mw-cite-backlink">^ <a href="#cite_ref-Kahan_1997_Status_55-0"><sup><i><b>a</b></i></sup></a> <a 
href="#cite_ref-Kahan_1997_Status_55-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahan1997" class="citation web cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a> (1997-10-01). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/ieee754status/IEEE754.PDF">"Lecture Notes on the Status of IEEE Standard 754 for Binary Floating-Point Arithmetic"</a> <span class="cs1-format">(PDF)</span>. p. 9. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20020622093102/http://www.cs.berkeley.edu/~wkahan/ieee754status/IEEE754.PDF">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2002-06-22.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Lecture+Notes+on+the+Status+of+IEEE+Standard+754+for+Binary+Floating-Point+Arithmetic&rft.pages=9&rft.date=1997-10-01&rft.aulast=Kahan&rft.aufirst=William+Morton&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2Fieee754status%2FIEEE754.PDF&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Intel-56"><span class="mw-cite-backlink"><b><a href="#cite_ref-Intel_56-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation book cs1"><a rel="nofollow" class="external text" href="http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">"D.3.2.1"</a>. <i>Intel 64 and IA-32 Architectures Software Developers' Manuals</i>. Vol. 
1.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=D.3.2.1&rft.btitle=Intel+64+and+IA-32+Architectures+Software+Developers%27+Manuals&rft_id=http%3A%2F%2Fwww.intel.com%2Fcontent%2Fwww%2Fus%2Fen%2Fprocessors%2Farchitectures-software-developer-manuals.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Harris-58"><span class="mw-cite-backlink"><b><a href="#cite_ref-Harris_58-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFHarris2010" class="citation journal cs1">Harris, Richard (October 2010). <a rel="nofollow" class="external text" href="http://accu.org/index.php/journals/1702">"You're Going To Have To Think!"</a>. <i><a href="/wiki/Overload_(magazine)" title="Overload (magazine)">Overload</a></i> (99): <span class="nowrap">5–</span>10. <a href="/wiki/ISSN_(identifier)" class="mw-redirect" title="ISSN (identifier)">ISSN</a> <a rel="nofollow" class="external text" href="https://search.worldcat.org/issn/1354-3172">1354-3172</a><span class="reference-accessdate">. Retrieved <span class="nowrap">2011-09-24</span></span>. 
<q>Far more worrying is cancellation error which can yield catastrophic loss of precision.</q></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Overload&rft.atitle=You%27re+Going+To+Have+To+Think%21&rft.issue=99&rft.pages=%3Cspan+class%3D%22nowrap%22%3E5-%3C%2Fspan%3E10&rft.date=2010-10&rft.issn=1354-3172&rft.aulast=Harris&rft.aufirst=Richard&rft_id=http%3A%2F%2Faccu.org%2Findex.php%2Fjournals%2F1702&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> <a rel="nofollow" class="external autonumber" href="http://accu.org/var/uploads/journals/overload99.pdf">[4]</a></span> </li> <li id="cite_note-Barker-59"><span class="mw-cite-backlink"><b><a href="#cite_ref-Barker_59-0">^</a></b></span> <span class="reference-text">Christopher Barker: <a rel="nofollow" class="external text" href="https://www.python.org/dev/peps/pep-0485/"><i>PEP 485 -- A Function for testing approximate equality</i></a></span> </li> <li id="cite_note-GAO_report_IMTEC_92-26-60"><span class="mw-cite-backlink"><b><a href="#cite_ref-GAO_report_IMTEC_92-26_60-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://www.gao.gov/products/IMTEC-92-26">"Patriot missile defense, Software problem led to system failure at Dhahran, Saudi Arabia"</a>. <a href="/wiki/US_Government_Accounting_Office" class="mw-redirect" title="US Government Accounting Office">US Government Accounting Office</a>. 
GAO report IMTEC 92-26.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Patriot+missile+defense%2C+Software+problem+led+to+system+failure+at+Dhahran%2C+Saudi+Arabia&rft.pub=US+Government+Accounting+Office&rft_id=http%3A%2F%2Fwww.gao.gov%2Fproducts%2FIMTEC-92-26&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Skeel-61"><span class="mw-cite-backlink"><b><a href="#cite_ref-Skeel_61-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSkeel1992" class="citation cs2">Skeel, Robert (July 1992), <a rel="nofollow" class="external text" href="https://www-users.cse.umn.edu/~arnold/disasters/Patriot-dharan-skeel-siam.pdf">"Roundoff Error and the Patriot Missile"</a> <span class="cs1-format">(PDF)</span>, <i>SIAM News</i>, <b>25</b> (4): 11<span class="reference-accessdate">, retrieved <span class="nowrap">2024-11-15</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=SIAM+News&rft.atitle=Roundoff+Error+and+the+Patriot+Missile&rft.volume=25&rft.issue=4&rft.pages=11&rft.date=1992-07&rft.aulast=Skeel&rft.aufirst=Robert&rft_id=https%3A%2F%2Fwww-users.cse.umn.edu%2F~arnold%2Fdisasters%2FPatriot-dharan-skeel-siam.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-RalstonReilly2003-62"><span class="mw-cite-backlink"><b><a href="#cite_ref-RalstonReilly2003_62-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFWilkinson2003" class="citation book cs1"><a href="/wiki/James_Hardy_Wilkinson" class="mw-redirect" title="James Hardy Wilkinson">Wilkinson, James Hardy</a> (2003-09-08). "Error Analysis". 
In Ralston, Anthony; Reilly, Edwin D.; Hemmendinger, David (eds.). <a rel="nofollow" class="external text" href="https://books.google.com/books?id=OLRwQgAACAAJ"><i>Encyclopedia of Computer Science</i></a>. <a href="/wiki/Wiley_(publisher)" title="Wiley (publisher)">Wiley</a>. pp. <span class="nowrap">669–</span>674. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-470-86412-8" title="Special:BookSources/978-0-470-86412-8"><bdi>978-0-470-86412-8</bdi></a><span class="reference-accessdate">. Retrieved <span class="nowrap">2013-05-14</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Error+Analysis&rft.btitle=Encyclopedia+of+Computer+Science&rft.pages=%3Cspan+class%3D%22nowrap%22%3E669-%3C%2Fspan%3E674&rft.pub=Wiley&rft.date=2003-09-08&rft.isbn=978-0-470-86412-8&rft.aulast=Wilkinson&rft.aufirst=James+Hardy&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DOLRwQgAACAAJ&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Einarsson_2005-63"><span class="mw-cite-backlink"><b><a href="#cite_ref-Einarsson_2005_63-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFEinarsson2005" class="citation book cs1">Einarsson, Bo (2005). <a rel="nofollow" class="external text" href="https://books.google.com/books?id=sh4orx_qB_QC&pg=PA50"><i>Accuracy and reliability in scientific computing</i></a>. <a href="/wiki/Society_for_Industrial_and_Applied_Mathematics" title="Society for Industrial and Applied Mathematics">Society for Industrial and Applied Mathematics</a> (SIAM). pp. 50–. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-89871-815-7" title="Special:BookSources/978-0-89871-815-7"><bdi>978-0-89871-815-7</bdi></a><span class="reference-accessdate">. Retrieved <span class="nowrap">2013-05-14</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Accuracy+and+reliability+in+scientific+computing&rft.pages=50-&rft.pub=Society+for+Industrial+and+Applied+Mathematics+%28SIAM%29&rft.date=2005&rft.isbn=978-0-89871-815-7&rft.aulast=Einarsson&rft.aufirst=Bo&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3Dsh4orx_qB_QC%26pg%3DPA50&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Higham_2002-64"><span class="mw-cite-backlink">^ <a href="#cite_ref-Higham_2002_64-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Higham_2002_64-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Higham_2002_64-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-Higham_2002_64-3"><sup><i><b>d</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFHigham2002" class="citation book cs1"><a href="/wiki/Nicholas_Higham" title="Nicholas Higham">Higham, Nicholas John</a> (2002). <a rel="nofollow" class="external text" href="https://books.google.com/books?id=epilvM5MMxwC"><i>Accuracy and Stability of Numerical Algorithms</i></a> (2nd ed.). <a href="/wiki/Society_for_Industrial_and_Applied_Mathematics" title="Society for Industrial and Applied Mathematics">Society for Industrial and Applied Mathematics</a> (SIAM). pp. <span class="nowrap">27–</span>28, <span class="nowrap">110–</span>123, 493. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-89871-521-7" title="Special:BookSources/978-0-89871-521-7"><bdi>978-0-89871-521-7</bdi></a>. 0-89871-355-2.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Accuracy+and+Stability+of+Numerical+Algorithms&rft.pages=%3Cspan+class%3D%22nowrap%22%3E27-%3C%2Fspan%3E28%2C+%3Cspan+class%3D%22nowrap%22%3E110-%3C%2Fspan%3E123%2C+493&rft.edition=2nd&rft.pub=Society+for+Industrial+and+Applied+Mathematics+%28SIAM%29&rft.date=2002&rft.isbn=978-0-89871-521-7&rft.aulast=Higham&rft.aufirst=Nicholas+John&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DepilvM5MMxwC&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-OliveiraStewart_2006-65"><span class="mw-cite-backlink"><b><a href="#cite_ref-OliveiraStewart_2006_65-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFOliveiraStewart2006" class="citation book cs1">Oliveira, Suely; Stewart, David E. (2006-09-07). <a rel="nofollow" class="external text" href="https://books.google.com/books?id=E6a8oZOS8noC&pg=PA10"><i>Writing Scientific Software: A Guide to Good Style</i></a>. <a href="/wiki/Cambridge_University_Press" title="Cambridge University Press">Cambridge University Press</a>. pp. 10–. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-1-139-45862-7" title="Special:BookSources/978-1-139-45862-7"><bdi>978-1-139-45862-7</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Writing+Scientific+Software%3A+A+Guide+to+Good+Style&rft.pages=10-&rft.pub=Cambridge+University+Press&rft.date=2006-09-07&rft.isbn=978-1-139-45862-7&rft.aulast=Oliveira&rft.aufirst=Suely&rft.au=Stewart%2C+David+E.&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DE6a8oZOS8noC%26pg%3DPA10&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kahan_2005_ARITH17-66"><span class="mw-cite-backlink">^ <a href="#cite_ref-Kahan_2005_ARITH17_66-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Kahan_2005_ARITH17_66-1"><sup><i><b>b</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahan2005" class="citation conference cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a> (2005-07-15). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/ARITH_17.pdf"><i>Floating-Point Arithmetic Besieged by "Business Decisions"</i></a> <span class="cs1-format">(PDF)</span>. IEEE-sponsored <a href="/wiki/ARITH_17" class="mw-redirect" title="ARITH 17">ARITH 17</a>, Symposium on Computer Arithmetic (Keynote Address). pp. 6, 18. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20060317103619/http://www.cs.berkeley.edu/~wkahan/ARITH_17.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2006-03-17<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2013-05-23</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Floating-Point+Arithmetic+Besieged+by+%22Business+Decisions%22&rft.pages=6%2C+18&rft.date=2005-07-15&rft.aulast=Kahan&rft.aufirst=William+Morton&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2FARITH_17.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (NB. Kahan estimates that the incidence of excessively inaccurate results near singularities is reduced by a factor of approx. 2000 using the 11 extra bits of precision of <a href="/wiki/Extended_precision" title="Extended precision">double extended</a>.)</span> </li> <li id="cite_note-Kahan_2011_Debug-67"><span class="mw-cite-backlink"><b><a href="#cite_ref-Kahan_2011_Debug_67-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahan2011" class="citation conference cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a> (2011-08-03). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/Boulder.pdf"><i>Desperately Needed Remedies for the Undebuggability of Large Floating-Point Computations in Science and Engineering</i></a> <span class="cs1-format">(PDF)</span>. IFIP/SIAM/NIST Working Conference on Uncertainty Quantification in Scientific Computing, Boulder, CO. p. 33. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20130620140729/http://www.eecs.berkeley.edu/~wkahan/Boulder.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2013-06-20.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Desperately+Needed+Remedies+for+the+Undebuggability+of+Large+Floating-Point+Computations+in+Science+and+Engineering&rft.pages=33&rft.date=2011-08-03&rft.aulast=Kahan&rft.aufirst=William+Morton&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2FBoulder.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kahan_2001_JavaHurt-70"><span class="mw-cite-backlink"><b><a href="#cite_ref-Kahan_2001_JavaHurt_70-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahanDarcy2001" class="citation web cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a>; Darcy, Joseph (2001) [1998-03-01]. <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/JAVAhurt.pdf">"How Java's floating-point hurts everyone everywhere"</a> <span class="cs1-format">(PDF)</span>. <a rel="nofollow" class="external text" href="https://web.archive.org/web/20000816043653/http://www.cs.berkeley.edu/~wkahan/JAVAhurt.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2000-08-16<span class="reference-accessdate">. 
Retrieved <span class="nowrap">2003-09-05</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=How+Java%27s+floating-point+hurts+everyone+everywhere&rft.date=2001&rft.aulast=Kahan&rft.aufirst=William+Morton&rft.au=Darcy%2C+Joseph&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2FJAVAhurt.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kahan_2000_Marketing-72"><span class="mw-cite-backlink">^ <a href="#cite_ref-Kahan_2000_Marketing_72-0"><sup><i><b>a</b></i></sup></a> <a href="#cite_ref-Kahan_2000_Marketing_72-1"><sup><i><b>b</b></i></sup></a> <a href="#cite_ref-Kahan_2000_Marketing_72-2"><sup><i><b>c</b></i></sup></a> <a href="#cite_ref-Kahan_2000_Marketing_72-3"><sup><i><b>d</b></i></sup></a></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahan2000" class="citation web cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a> (2000-08-27). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/MktgMath.pdf">"Marketing versus Mathematics"</a> <span class="cs1-format">(PDF)</span>. pp. 15, 35, 47. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20030815150333/http://www.cs.berkeley.edu/~wkahan/MktgMath.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2003-08-15.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Marketing+versus+Mathematics&rft.pages=15%2C+35%2C+47&rft.date=2000-08-27&rft.aulast=Kahan&rft.aufirst=William+Morton&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2FMktgMath.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kahan_1981_WhyIEEE-73"><span class="mw-cite-backlink"><b><a href="#cite_ref-Kahan_1981_WhyIEEE_73-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahan1981" class="citation web cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a> (1981-02-12). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/ieee754status/why-ieee.pdf">"Why do we need a floating-point arithmetic standard?"</a> <span class="cs1-format">(PDF)</span>. p. 26. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20041204070746/http://www.cs.berkeley.edu/~wkahan/ieee754status/why-ieee.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2004-12-04.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Why+do+we+need+a+floating-point+arithmetic+standard%3F&rft.pages=26&rft.date=1981-02-12&rft.aulast=Kahan&rft.aufirst=William+Morton&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2Fieee754status%2Fwhy-ieee.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kahan_2001_LN-74"><span class="mw-cite-backlink"><b><a href="#cite_ref-Kahan_2001_LN_74-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahan2001" class="citation web cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a> (2001-06-04). Bindel, David (ed.). <a rel="nofollow" class="external text" href="http://www.cims.nyu.edu/~dbindel/class/cs279/notes-06-04.pdf">"Lecture notes of System Support for Scientific Computation"</a> <span class="cs1-format">(PDF)</span>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20130517181356/http://www.cims.nyu.edu/~dbindel/class/cs279/notes-06-04.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2013-05-17.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Lecture+notes+of+System+Support+for+Scientific+Computation&rft.date=2001-06-04&rft.aulast=Kahan&rft.aufirst=William+Morton&rft_id=http%3A%2F%2Fwww.cims.nyu.edu%2F~dbindel%2Fclass%2Fcs279%2Fnotes-06-04.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Speleotrove_2012-75"><span class="mw-cite-backlink"><b><a href="#cite_ref-Speleotrove_2012_75-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://speleotrove.com/decimal/">"General Decimal Arithmetic"</a>. Speleotrove.com<span class="reference-accessdate">. Retrieved <span class="nowrap">2012-04-25</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=General+Decimal+Arithmetic&rft.pub=Speleotrove.com&rft_id=https%3A%2F%2Fspeleotrove.com%2Fdecimal%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Christiansen_Perl-76"><span class="mw-cite-backlink"><b><a href="#cite_ref-Christiansen_Perl_76-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFChristiansenTorkington2006" class="citation web cs1">Christiansen, Tom; Torkington, Nathan; et al. (2006). <a rel="nofollow" class="external text" href="https://perldoc.perl.org/5.8.8/perlfaq4#Why-is-int()-broken?">"perlfaq4 / Why is int() broken?"</a>. 
perldoc.perl.org<span class="reference-accessdate">. Retrieved <span class="nowrap">2011-01-11</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=perlfaq4+%2F+Why+is+int%28%29+broken%3F&rft.pub=perldoc.perl.org&rft.date=2006&rft.aulast=Christiansen&rft.aufirst=Tom&rft.au=Torkington%2C+Nathan&rft_id=https%3A%2F%2Fperldoc.perl.org%2F5.8.8%2Fperlfaq4%23Why-is-int%28%29-broken%3F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Shewchuk-77"><span class="mw-cite-backlink"><b><a href="#cite_ref-Shewchuk_77-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFShewchuk1997" class="citation journal cs1">Shewchuk, Jonathan Richard (1997). <a rel="nofollow" class="external text" href="https://doi.org/10.1007%2FPL00009321">"Adaptive Precision Floating-Point Arithmetic and Fast Robust Geometric Predicates"</a>. <i><a href="/wiki/Discrete_%26_Computational_Geometry" title="Discrete & Computational Geometry">Discrete & Computational Geometry</a></i>. <b>18</b> (3): <span class="nowrap">305–</span>363. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1007%2FPL00009321">10.1007/PL00009321</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=Discrete+%26+Computational+Geometry&rft.atitle=Adaptive+Precision+Floating-Point+Arithmetic+and+Fast+Robust+Geometric+Predicates&rft.volume=18&rft.issue=3&rft.pages=%3Cspan+class%3D%22nowrap%22%3E305-%3C%2Fspan%3E363&rft.date=1997&rft_id=info%3Adoi%2F10.1007%2FPL00009321&rft.aulast=Shewchuk&rft.aufirst=Jonathan+Richard&rft_id=https%3A%2F%2Fdoi.org%2F10.1007%252FPL00009321&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Kahan_1997_Cantilever-78"><span class="mw-cite-backlink"><b><a href="#cite_ref-Kahan_1997_Cantilever_78-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKahanIvory1997" class="citation web cs1"><a href="/wiki/William_Morton_Kahan" class="mw-redirect" title="William Morton Kahan">Kahan, William Morton</a>; Ivory, Melody Y. (1997-07-03). <a rel="nofollow" class="external text" href="https://people.eecs.berkeley.edu/~wkahan/Cantilever.pdf">"Roundoff Degrades an Idealized Cantilever"</a> <span class="cs1-format">(PDF)</span>. 
<a rel="nofollow" class="external text" href="https://web.archive.org/web/20031205191038/http://www.cs.berkeley.edu/~wkahan/Cantilever.pdf">Archived</a> <span class="cs1-format">(PDF)</span> from the original on 2003-12-05.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Roundoff+Degrades+an+Idealized+Cantilever&rft.date=1997-07-03&rft.aulast=Kahan&rft.aufirst=William+Morton&rft.au=Ivory%2C+Melody+Y.&rft_id=https%3A%2F%2Fpeople.eecs.berkeley.edu%2F~wkahan%2FCantilever.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Vectorizers-80"><span class="mw-cite-backlink"><b><a href="#cite_ref-Vectorizers_80-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://llvm.org/docs/Vectorizers.html">"Auto-Vectorization in LLVM"</a>. <i>LLVM 13 documentation</i>. <q>We support floating point reduction operations when -ffast-math is used.</q></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=LLVM+13+documentation&rft.atitle=Auto-Vectorization+in+LLVM&rft_id=https%3A%2F%2Fllvm.org%2Fdocs%2FVectorizers.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-FPM-81"><span class="mw-cite-backlink"><b><a href="#cite_ref-FPM_81-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://gcc.gnu.org/wiki/FloatingPointMath">"FloatingPointMath"</a>. 
<i>GCC Wiki</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GCC+Wiki&rft.atitle=FloatingPointMath&rft_id=https%3A%2F%2Fgcc.gnu.org%2Fwiki%2FFloatingPointMath&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-harmful-82"><span class="mw-cite-backlink"><b><a href="#cite_ref-harmful_82-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55522">"55522 – -funsafe-math-optimizations is unexpectedly harmful, especially w/ -shared"</a>. <i>gcc.gnu.org</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=gcc.gnu.org&rft.atitle=55522+%E2%80%93+-funsafe-math-optimizations+is+unexpectedly+harmful%2C+especially+w%2F+-shared&rft_id=https%3A%2F%2Fgcc.gnu.org%2Fbugzilla%2Fshow_bug.cgi%3Fid%3D55522&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Gen-83"><span class="mw-cite-backlink"><b><a href="#cite_ref-Gen_83-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://gcc.gnu.org/onlinedocs/gfortran/Code-Gen-Options.html">"Code Gen Options (The GNU Fortran Compiler)"</a>. 
<i>gcc.gnu.org</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=gcc.gnu.org&rft.atitle=Code+Gen+Options+%28The+GNU+Fortran+Compiler%29&rft_id=https%3A%2F%2Fgcc.gnu.org%2Fonlinedocs%2Fgfortran%2FCode-Gen-Options.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-zheevd-84"><span class="mw-cite-backlink"><b><a href="#cite_ref-zheevd_84-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/Reference-LAPACK/lapack/issues/43">"Bug in zheevd · Issue #43 · Reference-LAPACK/lapack"</a>. <i>GitHub</i>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=Bug+in+zheevd+%C2%B7+Issue+%2343+%C2%B7+Reference-LAPACK%2Flapack&rft_id=https%3A%2F%2Fgithub.com%2FReference-LAPACK%2Flapack%2Fissues%2F43&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> <li id="cite_note-Becker-Darulova-Myreen-Tatlock_2019-85"><span class="mw-cite-backlink"><b><a href="#cite_ref-Becker-Darulova-Myreen-Tatlock_2019_85-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFBeckerDarulovaMyreenTatlock2019" class="citation conference cs1">Becker, Heiko; Darulova, Eva; Myreen, Magnus O.; Tatlock, Zachary (2019). <i>Icing: Supporting Fast-Math Style Optimizations in a Verified Compiler</i>. CAV 2019: Computer Aided Verification. Vol. 11562. pp. <span class="nowrap">155–</span>173. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-3-030-25543-5_10">10.1007/978-3-030-25543-5_10</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=Icing%3A+Supporting+Fast-Math+Style+Optimizations+in+a+Verified+Compiler&rft.pages=%3Cspan+class%3D%22nowrap%22%3E155-%3C%2Fspan%3E173&rft.date=2019&rft_id=info%3Adoi%2F10.1007%2F978-3-030-25543-5_10&rft.aulast=Becker&rft.aufirst=Heiko&rft.au=Darulova%2C+Eva&rft.au=Myreen%2C+Magnus+O.&rft.au=Tatlock%2C+Zachary&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></span> </li> </ol></div> <div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=26" title="Edit section: Further reading"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFWilkinson1963" class="citation book cs1"><a href="/wiki/James_Hardy_Wilkinson" class="mw-redirect" title="James Hardy Wilkinson">Wilkinson, James Hardy</a> (1963). <a rel="nofollow" class="external text" href="https://books.google.com/books?id=yFogU9Ot-qsC"><i>Rounding Errors in Algebraic Processes</i></a> (1st ed.). Englewood Cliffs, New Jersey, USA: <a href="/wiki/Prentice-Hall,_Inc." class="mw-redirect" title="Prentice-Hall, Inc.">Prentice-Hall, Inc.</a> <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/9780486679990" title="Special:BookSources/9780486679990"><bdi>9780486679990</bdi></a>. 
<a href="/wiki/MR_(identifier)" class="mw-redirect" title="MR (identifier)">MR</a> <a rel="nofollow" class="external text" href="https://mathscinet.ams.org/mathscinet-getitem?mr=0161456">0161456</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Rounding+Errors+in+Algebraic+Processes&rft.place=Englewood+Cliffs%2C+New+Jersey%2C+USA&rft.edition=1st&rft.pub=Prentice-Hall%2C+Inc.&rft.date=1963&rft.isbn=9780486679990&rft_id=https%3A%2F%2Fmathscinet.ams.org%2Fmathscinet-getitem%3Fmr%3D161456%23id-name%3DMR&rft.aulast=Wilkinson&rft.aufirst=James+Hardy&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DyFogU9Ot-qsC&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (NB. Classic influential treatises on floating-point arithmetic.)</li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFWilkinson1965" class="citation book cs1"><a href="/wiki/James_Hardy_Wilkinson" class="mw-redirect" title="James Hardy Wilkinson">Wilkinson, James Hardy</a> (1965). <a rel="nofollow" class="external text" href="https://books.google.com/books?id=N98IAQAAIAAJ&q=editions:ISBN0198534183"><i>The Algebraic Eigenvalue Problem</i></a>. Monographs on Numerical Analysis (1st ed.). <a href="/wiki/Oxford_University_Press" title="Oxford University Press">Oxford University Press</a> / <a href="/wiki/Clarendon_Press" class="mw-redirect" title="Clarendon Press">Clarendon Press</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/9780198534037" title="Special:BookSources/9780198534037"><bdi>9780198534037</bdi></a><span class="reference-accessdate">. 
Retrieved <span class="nowrap">2016-02-11</span></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=The+Algebraic+Eigenvalue+Problem&rft.series=Monographs+on+Numerical+Analysis&rft.edition=1st&rft.pub=Oxford+University+Press+%2F+Clarendon+Press&rft.date=1965&rft.isbn=9780198534037&rft.aulast=Wilkinson&rft.aufirst=James+Hardy&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3DN98IAQAAIAAJ%26q%3Deditions%3AISBN0198534183&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSterbenz1974" class="citation book cs1">Sterbenz, Pat H. (1974). <i>Floating-Point Computation</i>. Prentice-Hall Series in Automatic Computation (1st ed.). Englewood Cliffs, New Jersey, USA: <a href="/wiki/Prentice_Hall" title="Prentice Hall">Prentice Hall</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-13-322495-5" title="Special:BookSources/978-0-13-322495-5"><bdi>978-0-13-322495-5</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Floating-Point+Computation&rft.place=Englewood+Cliffs%2C+New+Jersey%2C+USA&rft.series=Prentice-Hall+Series+in+Automatic+Computation&rft.edition=1st&rft.pub=Prentice+Hall&rft.date=1974&rft.isbn=978-0-13-322495-5&rft.aulast=Sterbenz&rft.aufirst=Pat+H.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFGolubvan_Loan1986" class="citation book cs1">Golub, Gene F.; van Loan, Charles F. (1986). <i>Matrix Computations</i> (3rd ed.). <a href="/wiki/Johns_Hopkins_University_Press" title="Johns Hopkins University Press">Johns Hopkins University Press</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-8018-5413-2" title="Special:BookSources/978-0-8018-5413-2"><bdi>978-0-8018-5413-2</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Matrix+Computations&rft.edition=3rd&rft.pub=Johns+Hopkins+University+Press&rft.date=1986&rft.isbn=978-0-8018-5413-2&rft.aulast=Golub&rft.aufirst=Gene+F.&rft.au=van+Loan%2C+Charles+F.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFPressTeukolskyVetterlingFlannery2007" class="citation book cs1"><a href="/wiki/William_Henry_Press" class="mw-redirect" title="William Henry Press">Press, William Henry</a>; <a href="/wiki/Saul_A._Teukolsky" class="mw-redirect" title="Saul A. Teukolsky">Teukolsky, Saul A.</a>; <a href="/wiki/William_T._Vetterling" class="mw-redirect" title="William T. Vetterling">Vetterling, William T.</a>; <a href="/wiki/Brian_P._Flannery" title="Brian P. Flannery">Flannery, Brian P.</a> (2007) [1986]. <a href="/wiki/Numerical_Recipes" title="Numerical Recipes"><i>Numerical Recipes - The Art of Scientific Computing</i></a> (3rd ed.). <a href="/wiki/Cambridge_University_Press" title="Cambridge University Press">Cambridge University Press</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-521-88407-5" title="Special:BookSources/978-0-521-88407-5"><bdi>978-0-521-88407-5</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Numerical+Recipes+-+The+Art+of+Scientific+Computing&rft.edition=3rd&rft.pub=Cambridge+University+Press&rft.date=2007&rft.isbn=978-0-521-88407-5&rft.aulast=Press&rft.aufirst=William+Henry&rft.au=Teukolsky%2C+Saul+A.&rft.au=Vetterling%2C+William+T.&rft.au=Flannery%2C+Brian+P.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (NB. Edition with source code CD-ROM.)</li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKnuth1997" class="citation book cs1"><a href="/wiki/Donald_Ervin_Knuth" class="mw-redirect" title="Donald Ervin Knuth">Knuth, Donald Ervin</a> (1997). "Section 4.2: Floating-Point Arithmetic". <i><a href="/wiki/The_Art_of_Computer_Programming" title="The Art of Computer Programming">The Art of Computer Programming</a></i>, Vol. 2: <i>Seminumerical Algorithms</i> (3rd ed.). <a href="/wiki/Addison-Wesley" title="Addison-Wesley">Addison-Wesley</a>. pp. <span class="nowrap">214–</span>264. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-201-89684-8" title="Special:BookSources/978-0-201-89684-8"><bdi>978-0-201-89684-8</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=Section+4.2%3A+Floating-Point+Arithmetic&rft.btitle=The+Art+of+Computer+Programming%2C+Vol.+2%3A+Seminumerical+Algorithms&rft.pages=%3Cspan+class%3D%22nowrap%22%3E214-%3C%2Fspan%3E264&rft.edition=3rd&rft.pub=Addison-Wesley&rft.date=1997&rft.isbn=978-0-201-89684-8&rft.aulast=Knuth&rft.aufirst=Donald+Ervin&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFBlaauwBrooks,_Jr.1997" class="citation book cs1"><a href="/wiki/Gerrit_Anne_Blaauw" class="mw-redirect" title="Gerrit Anne Blaauw">Blaauw, Gerrit Anne</a>; <a href="/wiki/Frederick_Phillips_Brooks,_Jr." class="mw-redirect" title="Frederick Phillips Brooks, Jr.">Brooks, Jr., Frederick Phillips</a> (1997). <i>Computer Architecture: Concepts and Evolution</i> (1st ed.). <a href="/wiki/Addison-Wesley" title="Addison-Wesley">Addison-Wesley</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/0-201-10557-8" title="Special:BookSources/0-201-10557-8"><bdi>0-201-10557-8</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Computer+Architecture%3A+Concepts+and+Evolution&rft.edition=1st&rft.pub=Addison-Wesley&rft.date=1997&rft.isbn=0-201-10557-8&rft.aulast=Blaauw&rft.aufirst=Gerrit+Anne&rft.au=Brooks%2C+Jr.%2C+Frederick+Phillips&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (1213 pages) (NB. This is a single-volume edition. 
This work was also available in a two-volume version.)</li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFKornerupMatula2010" class="citation book cs1">Kornerup, Peter; Matula, David W. (2010). <i>Finite Precision Number Systems and Arithmetic</i>. <a href="/wiki/Cambridge_University_Press" title="Cambridge University Press">Cambridge University Press</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-0-521-76135-2" title="Special:BookSources/978-0-521-76135-2"><bdi>978-0-521-76135-2</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Finite+Precision+Number+Systems+and+Arithmetic&rft.pub=Cambridge+University+Press&rft.date=2010&rft.isbn=978-0-521-76135-2&rft.aulast=Kornerup&rft.aufirst=Peter&rft.au=Matula%2C+David+W.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFSavard2018" class="citation cs2">Savard, John J. G. 
(2018) [2005], <a rel="nofollow" class="external text" href="http://www.quadibloc.com/comp/cp0201.htm">"Floating-Point Formats"</a>, <i>quadibloc</i>, <a rel="nofollow" class="external text" href="https://web.archive.org/web/20180703001709/http://www.quadibloc.com/comp/cp0201.htm">archived</a> from the original on 2018-07-03<span class="reference-accessdate">, retrieved <span class="nowrap">2018-07-16</span></span></cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=quadibloc&rft.atitle=Floating-Point+Formats&rft.date=2018&rft.aulast=Savard&rft.aufirst=John+J.+G.&rft_id=http%3A%2F%2Fwww.quadibloc.com%2Fcomp%2Fcp0201.htm&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFMullerBruniede_DinechinJeannerod2018" class="citation book cs1">Muller, Jean-Michel; Brunie, Nicolas; de Dinechin, Florent; Jeannerod, Claude-Pierre; Joldes, Mioara; Lefèvre, Vincent; Melquiond, Guillaume; <a href="/wiki/Nathalie_Revol" title="Nathalie Revol">Revol, Nathalie</a>; Torres, Serge (2018) [2010]. <a rel="nofollow" class="external text" href="https://books.google.com/books?id=h3ZZDwAAQBAJ"><i>Handbook of Floating-Point Arithmetic</i></a> (2nd ed.). <a href="/wiki/Birkh%C3%A4user" title="Birkhäuser">Birkhäuser</a>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-3-319-76526-6">10.1007/978-3-319-76526-6</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-319-76525-9" title="Special:BookSources/978-3-319-76525-9"><bdi>978-3-319-76525-9</bdi></a>. 
<a href="/wiki/LCCN_(identifier)" class="mw-redirect" title="LCCN (identifier)">LCCN</a> <a rel="nofollow" class="external text" href="https://lccn.loc.gov/2018935254">2018935254</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Handbook+of+Floating-Point+Arithmetic&rft.edition=2nd&rft.pub=Birkh%C3%A4user&rft.date=2018&rft_id=info%3Alccn%2F2018935254&rft_id=info%3Adoi%2F10.1007%2F978-3-319-76526-6&rft.isbn=978-3-319-76525-9&rft.aulast=Muller&rft.aufirst=Jean-Michel&rft.au=Brunie%2C+Nicolas&rft.au=de+Dinechin%2C+Florent&rft.au=Jeannerod%2C+Claude-Pierre&rft.au=Joldes%2C+Mioara&rft.au=Lef%C3%A8vre%2C+Vincent&rft.au=Melquiond%2C+Guillaume&rft.au=Revol%2C+Nathalie&rft.au=Torres%2C+Serge&rft_id=https%3A%2F%2Fbooks.google.com%2Fbooks%3Fid%3Dh3ZZDwAAQBAJ&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></li></ul> <div class="mw-heading mw-heading2"><h2 id="External_links">External links</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Floating-point_arithmetic&action=edit&section=27" title="Edit section: External links"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://www.mrob.com/pub/math/floatformats.html">"Survey of Floating-Point Formats"</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Survey+of+Floating-Point+Formats&rft_id=http%3A%2F%2Fwww.mrob.com%2Fpub%2Fmath%2Ffloatformats.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (NB. 
This page gives a very brief summary of floating-point formats that have been used over the years.)</li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFMonniaux2008" class="citation journal cs1">Monniaux, David (May 2008). <a rel="nofollow" class="external text" href="https://hal.science/hal-00128124/en/">"The pitfalls of verifying floating-point computations"</a>. <i>ACM Transactions on Programming Languages and Systems</i>. <b>30</b> (3). <a href="/wiki/Association_for_Computing_Machinery" title="Association for Computing Machinery">Association for Computing Machinery</a> (ACM) Transactions on programming languages and systems (TOPLAS): <span class="nowrap">1–</span>41. <a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/cs/0701192">cs/0701192</a></span>. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1145%2F1353445.1353446">10.1145/1353445.1353446</a>. 
<a href="/wiki/S2CID_(identifier)" class="mw-redirect" title="S2CID (identifier)">S2CID</a> <a rel="nofollow" class="external text" href="https://api.semanticscholar.org/CorpusID:218578808">218578808</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=ACM+Transactions+on+Programming+Languages+and+Systems&rft.atitle=The+pitfalls+of+verifying+floating-point+computations&rft.volume=30&rft.issue=3&rft.pages=%3Cspan+class%3D%22nowrap%22%3E1-%3C%2Fspan%3E41&rft.date=2008-05&rft_id=info%3Aarxiv%2Fcs%2F0701192&rft_id=https%3A%2F%2Fapi.semanticscholar.org%2FCorpusID%3A218578808%23id-name%3DS2CID&rft_id=info%3Adoi%2F10.1145%2F1353445.1353446&rft.aulast=Monniaux&rft.aufirst=David&rft_id=https%3A%2F%2Fhal.science%2Fhal-00128124%2Fen%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span> (NB. A compendium of non-intuitive behaviors of floating point on popular architectures, with implications for program verification and testing.)</li> <li><a rel="nofollow" class="external text" href="https://opencores.org/">OpenCores</a>. (NB. This website contains open-source floating-point IP cores for the implementation of floating-point operators in FPGA or ASIC devices. The project <i>double_fpu</i> contains Verilog source code of a double-precision floating-point unit. The project <i>fpuvhdl</i> contains VHDL source code of a single-precision floating-point unit.)</li> <li><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222" /><cite id="CITEREFFleegal2004" class="citation web cs1">Fleegal, Eric (2004). <a rel="nofollow" class="external text" href="https://web.archive.org/web/20170706020455/http://msdn.microsoft.com/en-us/library/aa289157(v=vs.71).aspx">"Microsoft Visual C++ Floating-Point Optimization"</a>. <a href="/wiki/Microsoft_Developer_Network" title="Microsoft Developer Network">Microsoft Developer Network</a>. 
Archived from <a rel="nofollow" class="external text" href="http://msdn.microsoft.com/en-us/library/aa289157(v=vs.71).aspx">the original</a> on 2017-07-06.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Microsoft+Visual+C%2B%2B+Floating-Point+Optimization&rft.pub=Microsoft+Developer+Network&rft.date=2004&rft.aulast=Fleegal&rft.aufirst=Eric&rft_id=http%3A%2F%2Fmsdn.microsoft.com%2Fen-us%2Flibrary%2Faa289157%28v%3Dvs.71%29.aspx&rfr_id=info%3Asid%2Fen.wikipedia.org%3AFloating-point+arithmetic" class="Z3988"></span></li></ul> <div class="navbox-styles"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" /><style data-mw-deduplicate="TemplateStyles:r1236075235">.mw-parser-output .navbox{box-sizing:border-box;border:1px solid #a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em auto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output .navbox+.navbox,.mw-parser-output .navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output .navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output .navbox-group,.mw-parser-output .navbox-title,.mw-parser-output .navbox-abovebelow{padding:0.25em 1em;line-height:1.5em;text-align:center}.mw-parser-output .navbox-group{white-space:nowrap;text-align:right}.mw-parser-output .navbox,.mw-parser-output .navbox-subgroup{background-color:#fdfdfd}.mw-parser-output .navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output .navbox-list-with-group{text-align:left;border-left-width:2px;border-left-style:solid}.mw-parser-output tr+tr>.navbox-abovebelow,.mw-parser-output tr+tr>.navbox-group,.mw-parser-output tr+tr>.navbox-image,.mw-parser-output tr+tr>.navbox-list{border-top:2px solid #fdfdfd}.mw-parser-output .navbox-title{background-color:#ccf}.mw-parser-output .navbox-abovebelow,.mw-parser-output .navbox-group,.mw-parser-output .navbox-subgroup 
.navbox-title{background-color:#ddf}.mw-parser-output .navbox-subgroup .navbox-group,.mw-parser-output .navbox-subgroup .navbox-abovebelow{background-color:#e6e6ff}.mw-parser-output .navbox-even{background-color:#f7f7f7}.mw-parser-output .navbox-odd{background-color:transparent}.mw-parser-output .navbox .hlist td dl,.mw-parser-output .navbox .hlist td ol,.mw-parser-output .navbox .hlist td ul,.mw-parser-output .navbox td.hlist dl,.mw-parser-output .navbox td.hlist ol,.mw-parser-output .navbox td.hlist ul{padding:0.125em 0}.mw-parser-output .navbox .navbar{display:block;font-size:100%}.mw-parser-output .navbox-title .navbar{float:left;text-align:left;margin-right:0.5em}body.skin--responsive .mw-parser-output .navbox-image img{max-width:none!important}@media print{body.ns-0 .mw-parser-output .navbox{display:none!important}}</style></div><div role="navigation" class="navbox" aria-labelledby="Data_types177" style="padding:3px"><table class="nowraplinks hlist mw-collapsible autocollapse navbox-inner" style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th scope="col" class="navbox-title" colspan="2"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374" /><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239400231" /><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Data_types" title="Template:Data types"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Data_types" title="Template talk:Data types"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Data_types" title="Special:EditPage/Template:Data types"><abbr title="Edit this template">e</abbr></a></li></ul></div><div id="Data_types177" style="font-size:114%;margin:0 4em"><a href="/wiki/Data_type" title="Data type">Data types</a></div></th></tr><tr><th scope="row" class="navbox-group" 
style="width:1%"><a href="/wiki/Units_of_information" title="Units of information">Uninterpreted</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Bit" title="Bit">Bit</a></li> <li><a href="/wiki/Byte" title="Byte">Byte</a></li> <li><a href="/wiki/Ternary_numeral_system" title="Ternary numeral system">Trit</a></li> <li><a href="/wiki/Ternary_numeral_system#Tryte" title="Ternary numeral system">Tryte</a></li> <li><a href="/wiki/Word_(computer_architecture)" title="Word (computer architecture)">Word</a></li> <li><a href="/wiki/Bit_array" title="Bit array">Bit array</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Numeric</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Arbitrary-precision_arithmetic" title="Arbitrary-precision arithmetic">Arbitrary-precision or bignum</a></li> <li><a href="/wiki/Complex_data_type" title="Complex data type">Complex</a></li> <li><a href="/wiki/Decimal_data_type" title="Decimal data type">Decimal</a></li> <li><a href="/wiki/Fixed-point_arithmetic" title="Fixed-point arithmetic">Fixed point</a></li> <li><a class="mw-selflink selflink">Floating point</a> <ul><li>Reduced precision <ul><li><a href="/wiki/Minifloat" title="Minifloat">Minifloat</a></li> <li><a href="/wiki/Half-precision_floating-point_format" title="Half-precision floating-point format">Half precision</a></li> <li><a href="/wiki/Bfloat16_floating-point_format" title="Bfloat16 floating-point format">bfloat16</a></li></ul></li> <li><a href="/wiki/Single-precision_floating-point_format" title="Single-precision floating-point format">Single precision</a></li> <li><a href="/wiki/Double-precision_floating-point_format" title="Double-precision floating-point format">Double precision</a></li> <li><a href="/wiki/Quadruple-precision_floating-point_format" 
title="Quadruple-precision floating-point format">Quadruple precision</a></li> <li><a href="/wiki/Octuple-precision_floating-point_format" title="Octuple-precision floating-point format">Octuple precision</a></li> <li><a href="/wiki/Extended_precision" title="Extended precision">Extended precision</a> <ul><li><a href="/wiki/Long_double" title="Long double">Long double</a></li></ul></li></ul></li> <li><a href="/wiki/Integer_(computer_science)" title="Integer (computer science)">Integer</a> <ul><li><a href="/wiki/Signedness" title="Signedness">signedness</a></li></ul></li> <li><a href="/wiki/Interval_arithmetic#Implementations" title="Interval arithmetic">Interval</a></li> <li><a href="/wiki/Rational_data_type" title="Rational data type">Rational</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Pointer_(computer_programming)" title="Pointer (computer programming)">Pointer</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Memory_address" title="Memory address">Address</a> <ul><li><a href="/wiki/Physical_address" title="Physical address">physical</a></li> <li><a href="/wiki/Virtual_address_space" title="Virtual address space">virtual</a></li></ul></li> <li><a href="/wiki/Reference_(computer_science)" title="Reference (computer science)">Reference</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Plain_text" title="Plain text">Text</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Character_(computing)" title="Character (computing)">Character</a></li> <li><a href="/wiki/String_(computer_science)" title="String (computer science)">String</a> <ul><li><a href="/wiki/Null-terminated_string" title="Null-terminated string">null-terminated</a></li></ul></li></ul> 
</div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Composite_data_type" title="Composite data type">Composite</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Algebraic_data_type" title="Algebraic data type">Algebraic data type</a> <ul><li><a href="/wiki/Generalized_algebraic_data_type" title="Generalized algebraic data type">generalized</a></li></ul></li> <li><a href="/wiki/Array_data_type" class="mw-redirect" title="Array data type">Array</a></li> <li><a href="/wiki/Associative_array" title="Associative array">Associative array</a></li> <li><a href="/wiki/Class_(computer_programming)" title="Class (computer programming)">Class</a></li> <li><a href="/wiki/Dependent_type" title="Dependent type">Dependent</a></li> <li><a href="/wiki/Intuitionistic_type_theory#Equality_type" title="Intuitionistic type theory">Equality</a></li> <li><a href="/wiki/Inductive_type" title="Inductive type">Inductive</a></li> <li><a href="/wiki/Intersection_type" title="Intersection type">Intersection</a></li> <li><a href="/wiki/List_(abstract_data_type)" title="List (abstract data type)">List</a></li> <li><a href="/wiki/Object_(computer_science)" title="Object (computer science)">Object</a> <ul><li><a href="/wiki/Metaobject" title="Metaobject">metaobject</a></li></ul></li> <li><a href="/wiki/Option_type" title="Option type">Option type</a></li> <li><a href="/wiki/Product_type" title="Product type">Product</a></li> <li><a href="/wiki/Record_(computer_science)" title="Record (computer science)">Record or Struct</a></li> <li><a href="/wiki/Refinement_type" title="Refinement type">Refinement</a></li> <li><a href="/wiki/Set_(abstract_data_type)" title="Set (abstract data type)">Set</a></li> <li><a href="/wiki/Union_type" title="Union type">Union</a> <ul><li><a href="/wiki/Tagged_union" title="Tagged union">tagged</a></li></ul></li></ul> 
</div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Other</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Boolean_data_type" title="Boolean data type">Boolean</a></li> <li><a href="/wiki/Bottom_type" title="Bottom type">Bottom type</a></li> <li><a href="/wiki/Container_(abstract_data_type)" title="Container (abstract data type)">Collection</a></li> <li><a href="/wiki/Enumerated_type" title="Enumerated type">Enumerated type</a></li> <li><a href="/wiki/Exception_handling" title="Exception handling">Exception</a></li> <li><a href="/wiki/Function_type" title="Function type">Function type</a></li> <li><a href="/wiki/Opaque_data_type" title="Opaque data type">Opaque data type</a></li> <li><a href="/wiki/Recursive_data_type" title="Recursive data type">Recursive data type</a></li> <li><a href="/wiki/Semaphore_(programming)" title="Semaphore (programming)">Semaphore</a></li> <li><a href="/wiki/Stream_(computing)" title="Stream (computing)">Stream</a></li> <li><a href="/wiki/Strongly_typed_identifier" title="Strongly typed identifier">Strongly typed identifier</a></li> <li><a href="/wiki/Top_type" title="Top type">Top type</a></li> <li><a href="/wiki/Type_class" title="Type class">Type class</a></li> <li><a href="/wiki/Empty_type" title="Empty type">Empty type</a></li> <li><a href="/wiki/Unit_type" title="Unit type">Unit type</a></li> <li><a href="/wiki/Void_type" title="Void type">Void</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Related<br />topics</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Abstract_data_type" title="Abstract data type">Abstract data type</a></li> <li><a href="/wiki/Boxing_(computer_science)" class="mw-redirect" title="Boxing (computer science)">Boxing</a></li> <li><a href="/wiki/Data_structure" 
title="Data structure">Data structure</a></li> <li><a href="/wiki/Generic_programming" title="Generic programming">Generic</a></li> <li><a href="/wiki/Kind_(type_theory)" title="Kind (type theory)">Kind</a> <ul><li><a href="/wiki/Metaclass" title="Metaclass">metaclass</a></li></ul></li> <li><a href="/wiki/Parametric_polymorphism" title="Parametric polymorphism">Parametric polymorphism</a></li> <li><a href="/wiki/Primitive_data_type" title="Primitive data type">Primitive data type</a></li> <li><a href="/wiki/Interface_(object-oriented_programming)" title="Interface (object-oriented programming)">Interface</a></li> <li><a href="/wiki/Subtyping" title="Subtyping">Subtyping</a></li> <li><a href="/wiki/Type_constructor" title="Type constructor">Type constructor</a></li> <li><a href="/wiki/Type_conversion" title="Type conversion">Type conversion</a></li> <li><a href="/wiki/Type_system" title="Type system">Type system</a></li> <li><a href="/wiki/Type_theory" title="Type theory">Type theory</a></li> <li><a href="/wiki/Variable_(computer_science)" title="Variable (computer science)">Variable</a></li></ul> </div></td></tr></tbody></table></div> <!-- NewPP limit report Parsed by mw‐api‐ext.eqiad.main‐df448499c‐qfz9n Cached time: 20250318154823 Cache expiry: 2592000 Reduced expiry: false Complications: [vary‐revision‐sha1, show‐toc] CPU time usage: 1.359 seconds Real time usage: 1.829 seconds Preprocessor visited node count: 11597/1000000 Post‐expand include size: 221018/2097152 bytes Template argument size: 12496/2097152 bytes Highest expansion depth: 18/100 Expensive parser function count: 12/500 Unstrip recursion depth: 1/20 Unstrip post‐expand size: 320391/5000000 bytes Lua time usage: 0.713/10.000 seconds Lua memory usage: 12464446/52428800 bytes Number of Wikibase entities loaded: 0/400 --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 1374.413 1 -total 38.09% 523.458 2 Template:Reflist 13.53% 186.011 22 Template:Cite_book 10.49% 144.240 38 
Template:Cite_web 7.27% 99.973 2 Template:Floating-point 7.04% 96.801 1 Template:Sfn 7.02% 96.505 1 Template:Sidebar 6.85% 94.081 1 Template:Short_description 4.50% 61.821 22 Template:Val 4.30% 59.162 2 Template:Pagetype --> <!-- Saved in parser cache with key enwiki:pcache:11376:|#|:idhash:canonical and timestamp 20250318154823 and revision id 1280068135. Rendering was triggered because: api-parse --> </div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?useformat=desktop&type=1x1&usesul3=0" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript> <div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://en.wikipedia.org/w/index.php?title=Floating-point_arithmetic&oldid=1280068135">https://en.wikipedia.org/w/index.php?title=Floating-point_arithmetic&oldid=1280068135</a>"</div></div> <div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Help:Category" title="Help:Category">Categories</a>: <ul><li><a href="/wiki/Category:Floating_point" title="Category:Floating point">Floating point</a></li><li><a href="/wiki/Category:Computer_arithmetic" title="Category:Computer arithmetic">Computer arithmetic</a></li></ul></div><div id="mw-hidden-catlinks" class="mw-hidden-catlinks mw-hidden-cats-hidden">Hidden categories: <ul><li><a href="/wiki/Category:CS1_German-language_sources_(de)" title="Category:CS1 German-language sources (de)">CS1 German-language sources (de)</a></li><li><a href="/wiki/Category:Webarchive_template_wayback_links" title="Category:Webarchive template wayback links">Webarchive template wayback links</a></li><li><a href="/wiki/Category:Articles_with_short_description" title="Category:Articles with short description">Articles with short description</a></li><li><a href="/wiki/Category:Short_description_is_different_from_Wikidata" 
title="Category:Short description is different from Wikidata">Short description is different from Wikidata</a></li><li><a href="/wiki/Category:Use_dmy_dates_from_May_2019" title="Category:Use dmy dates from May 2019">Use dmy dates from May 2019</a></li><li><a href="/wiki/Category:All_articles_with_unsourced_statements" title="Category:All articles with unsourced statements">All articles with unsourced statements</a></li><li><a href="/wiki/Category:Articles_with_unsourced_statements_from_July_2020" title="Category:Articles with unsourced statements from July 2020">Articles with unsourced statements from July 2020</a></li><li><a href="/wiki/Category:Wikipedia_articles_needing_clarification_from_November_2024" title="Category:Wikipedia articles needing clarification from November 2024">Wikipedia articles needing clarification from November 2024</a></li><li><a href="/wiki/Category:Articles_with_unsourced_statements_from_June_2016" title="Category:Articles with unsourced statements from June 2016">Articles with unsourced statements from June 2016</a></li><li><a href="/wiki/Category:Articles_with_example_C_code" title="Category:Articles with example C code">Articles with example C code</a></li></ul></div></div> </div> </main> </div> <div class="mw-footer-container"> <footer id="footer" class="mw-footer" > <ul id="footer-info"> <li id="footer-info-lastmod"> This page was last edited on 12 March 2025, at 08:46<span class="anonymous-show"> (UTC)</span>.</li> <li id="footer-info-copyright">Text is available under the <a href="/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License" title="Wikipedia:Text of the Creative Commons Attribution-ShareAlike 4.0 International License">Creative Commons Attribution-ShareAlike 4.0 License</a>; additional terms may apply. 
By using this site, you agree to the <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use" class="extiw" title="foundation:Special:MyLanguage/Policy:Terms of Use">Terms of Use</a> and <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy" class="extiw" title="foundation:Special:MyLanguage/Policy:Privacy policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a rel="nofollow" class="external text" href="https://wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li>
</ul>
<!-- Footer "places" links. Off-site targets are written as absolute https: URLs instead of protocol-relative ones, so they do not inherit the scheme of whatever origin serves this page. -->
<ul id="footer-places">
  <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li>
  <li id="footer-places-about"><a href="/wiki/Wikipedia:About">About Wikipedia</a></li>
  <li id="footer-places-disclaimers"><a href="/wiki/Wikipedia:General_disclaimer">Disclaimers</a></li>
  <li id="footer-places-contact"><a href="https://en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li>
  <li id="footer-places-wm-codeofconduct"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct">Code of Conduct</a></li>
  <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li>
  <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/en.wikipedia.org">Statistics</a></li>
  <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li>
  <!-- The ampersand in the query string is escaped so the attribute value is unambiguous to every HTML parser. -->
  <li id="footer-places-mobileview"><a href="https://en.m.wikipedia.org/w/index.php?title=Floating-point_arithmetic&amp;mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li>
</ul>
<!-- Footer badge links (hidden when printing via the "noprint" class). -->
<ul id="footer-icons" class="noprint">
<li id="footer-copyrightico"><a href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large
cdx-button--fake-button--enabled"><picture><source media="(min-width: 500px)" srcset="/static/images/footer/wikimedia-button.svg" width="84" height="29"><img src="/static/images/footer/wikimedia.svg" width="25" height="25" alt="Wikimedia Foundation" lang="en" loading="lazy"></picture></a></li>
<li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><picture><source media="(min-width: 500px)" srcset="/w/resources/assets/poweredby_mediawiki.svg" width="88" height="31"><img src="/w/resources/assets/mediawiki_compact.svg" alt="Powered by MediaWiki" lang="en" width="25" height="25" loading="lazy"></picture></a></li>
</ul>
</footer>
</div>
</div>
</div>
<!-- Sticky header: a second copy of the page header controls. Its icon groups are marked aria-hidden and their controls use tabindex="-1", so they are kept out of the accessibility tree and the tab order. -->
<div class="vector-header-container vector-sticky-header-container">
  <div id="vector-sticky-header" class="vector-sticky-header">
    <div class="vector-sticky-header-start">
      <div class="vector-sticky-header-icon-start vector-button-flush-left vector-button-flush-right" aria-hidden="true">
        <!-- Explicit type="button": this toggle performs an in-page action and must never act as a submit control. -->
        <button class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-sticky-header-search-toggle" type="button" tabindex="-1" data-event-name="ui.vector-sticky-search-form.icon"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </button>
      </div>
      <div role="search" class="vector-search-box-vue vector-search-box-show-thumbnail vector-search-box">
        <div class="vector-typeahead-search-container">
          <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail">
            <form action="/w/index.php" id="vector-sticky-search-form" class="cdx-search-input cdx-search-input--has-end-button">
              <div class="cdx-search-input__input-wrapper" data-search-loc="header-moved">
                <div class="cdx-text-input cdx-text-input--has-start-icon">
                  <!-- A placeholder is not a dependable accessible name, so the field also carries an aria-label with the same text. -->
                  <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikipedia" aria-label="Search Wikipedia">
                  <span class="cdx-text-input__icon cdx-text-input__start-icon"></span>
                </div>
<input type="hidden" name="title" value="Special:Search">
</div>
<!-- Explicit type="submit": this button submits the sticky search form (previously relied on the implicit default). -->
<button class="cdx-button cdx-search-input__end-button" type="submit">Search</button>
</form>
</div>
</div>
</div>
<div class="vector-sticky-header-context-bar">
  <nav aria-label="Contents" class="vector-toc-landmark">
    <!-- Table-of-contents dropdown driven by a checkbox. The checkbox carries the accessible name via aria-label; the visual label is hidden from assistive technology to avoid a duplicate announcement. -->
    <div id="vector-sticky-header-toc" class="vector-dropdown mw-portlet mw-portlet-sticky-header-toc vector-sticky-header-toc vector-button-flush-left">
      <input type="checkbox" id="vector-sticky-header-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-sticky-header-toc" class="vector-dropdown-checkbox" aria-label="Toggle the table of contents">
      <label id="vector-sticky-header-toc-label" for="vector-sticky-header-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" aria-hidden="true"><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label>
      <div class="vector-dropdown-content">
        <div id="vector-sticky-header-toc-unpinned-container" class="vector-unpinned-container"> </div>
      </div>
    </div>
  </nav>
  <div class="vector-sticky-header-context-bar-primary" aria-hidden="true"><span class="mw-page-title-main">Floating-point arithmetic</span></div>
</div>
</div>
<!-- NOTE(review): the icon links in this group use href="#" placeholders with empty text spans; presumably the skin's script fills in the real targets and labels. Confirm before changing them. -->
<div class="vector-sticky-header-end" aria-hidden="true">
  <div class="vector-sticky-header-icons">
    <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-talk-sticky-header" tabindex="-1" data-event-name="talk-sticky-header"><span class="vector-icon mw-ui-icon-speechBubbles mw-ui-icon-wikimedia-speechBubbles"></span> <span></span> </a>
    <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-subject-sticky-header" tabindex="-1"
data-event-name="subject-sticky-header"><span class="vector-icon mw-ui-icon-article mw-ui-icon-wikimedia-article"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-history-sticky-header" tabindex="-1" data-event-name="history-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-history mw-ui-icon-wikimedia-wikimedia-history"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only mw-watchlink" id="ca-watchstar-sticky-header" tabindex="-1" data-event-name="watch-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-star mw-ui-icon-wikimedia-wikimedia-star"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-edit-sticky-header" tabindex="-1" data-event-name="wikitext-edit-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-wikiText mw-ui-icon-wikimedia-wikimedia-wikiText"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-ve-edit-sticky-header" tabindex="-1" data-event-name="ve-edit-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-edit mw-ui-icon-wikimedia-wikimedia-edit"></span> <span></span> </a> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" id="ca-viewsource-sticky-header" tabindex="-1" data-event-name="ve-edit-protected-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-editLock mw-ui-icon-wikimedia-wikimedia-editLock"></span> <span></span> </a> </div> <div class="vector-sticky-header-buttons"> <button class="cdx-button cdx-button--weight-quiet mw-interlanguage-selector" 
id="p-lang-btn-sticky-header" tabindex="-1" data-event-name="ui.dropdown-p-lang-btn-sticky-header"><span class="vector-icon mw-ui-icon-wikimedia-language mw-ui-icon-wikimedia-wikimedia-language"></span> <span>44 languages</span> </button> <a href="#" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive" id="ca-addsection-sticky-header" tabindex="-1" data-event-name="addsection-sticky-header"><span class="vector-icon mw-ui-icon-speechBubbleAdd-progressive mw-ui-icon-wikimedia-speechBubbleAdd-progressive"></span> <span>Add topic</span> </a> </div> <div class="vector-sticky-header-icon-end"> <div class="vector-user-links"> </div> </div> </div> </div> </div> <div class="mw-portlet mw-portlet-dock-bottom emptyPortlet" id="p-dock-bottom"> <ul> </ul> </div> <script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.eqiad.main-754588485-5l7q5","wgBackendResponseTime":256,"wgPageParseReport":{"limitreport":{"cputime":"1.359","walltime":"1.829","ppvisitednodes":{"value":11597,"limit":1000000},"postexpandincludesize":{"value":221018,"limit":2097152},"templateargumentsize":{"value":12496,"limit":2097152},"expansiondepth":{"value":18,"limit":100},"expensivefunctioncount":{"value":12,"limit":500},"unstrip-depth":{"value":1,"limit":20},"unstrip-size":{"value":320391,"limit":5000000},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 1374.413 1 -total"," 38.09% 523.458 2 Template:Reflist"," 13.53% 186.011 22 Template:Cite_book"," 10.49% 144.240 38 Template:Cite_web"," 7.27% 99.973 2 Template:Floating-point"," 7.04% 96.801 1 Template:Sfn"," 7.02% 96.505 1 Template:Sidebar"," 6.85% 94.081 1 Template:Short_description"," 4.50% 61.821 22 Template:Val"," 4.30% 59.162 2 Template:Pagetype"]},"scribunto":{"limitreport-timeusage":{"value":"0.713","limit":"10.000"},"limitreport-memusage":{"value":12464446,"limit":52428800},"limitreport-logs":"anchor_id_list = table#1 {\n 
[\"Base-256\"] = 1,\n [\"Base-4\"] = 1,\n [\"Base-65536\"] = 1,\n [\"Base-8\"] = 1,\n [\"CITEREFAdams2018\"] = 1,\n [\"CITEREFBeckerDarulovaMyreenTatlock2019\"] = 1,\n [\"CITEREFBeebe2017\"] = 1,\n [\"CITEREFBlaauwBrooks,_Jr.1997\"] = 1,\n [\"CITEREFBorland_staff1998\"] = 1,\n [\"CITEREFChristiansenTorkington2006\"] = 1,\n [\"CITEREFEinarsson2005\"] = 1,\n [\"CITEREFFleegal2004\"] = 1,\n [\"CITEREFGay1990\"] = 1,\n [\"CITEREFGiulietti\"] = 1,\n [\"CITEREFGoldberg1991\"] = 1,\n [\"CITEREFGolubvan_Loan1986\"] = 1,\n [\"CITEREFHarris2010\"] = 1,\n [\"CITEREFHigham2002\"] = 1,\n [\"CITEREFKahan1981\"] = 1,\n [\"CITEREFKahan1997\"] = 2,\n [\"CITEREFKahan2000\"] = 1,\n [\"CITEREFKahan2001\"] = 1,\n [\"CITEREFKahan2004\"] = 1,\n [\"CITEREFKahan2005\"] = 1,\n [\"CITEREFKahan2006\"] = 1,\n [\"CITEREFKahan2011\"] = 1,\n [\"CITEREFKahanDarcy2001\"] = 1,\n [\"CITEREFKahanIvory1997\"] = 1,\n [\"CITEREFKharya2020\"] = 1,\n [\"CITEREFKnuth1997\"] = 1,\n [\"CITEREFKornerupMatula2010\"] = 1,\n [\"CITEREFLazarus1957\"] = 1,\n [\"CITEREFLemire2021\"] = 1,\n [\"CITEREFLoitsch2010\"] = 1,\n [\"CITEREFMicikeviciusStosicBurgessCornea2022\"] = 1,\n [\"CITEREFMonniaux2008\"] = 1,\n [\"CITEREFMullerBruniede_DinechinJeannerod2018\"] = 1,\n [\"CITEREFOliveiraStewart2006\"] = 1,\n [\"CITEREFParkinson2000\"] = 1,\n [\"CITEREFPattersonHennessy2014\"] = 1,\n [\"CITEREFPressTeukolskyVetterlingFlannery2007\"] = 1,\n [\"CITEREFRandell1982\"] = 1,\n [\"CITEREFRojas1997\"] = 1,\n [\"CITEREFRojas2014\"] = 1,\n [\"CITEREFSavard2018\"] = 2,\n [\"CITEREFSeverance1998\"] = 1,\n [\"CITEREFShewchuk1997\"] = 1,\n [\"CITEREFSkeel1992\"] = 1,\n [\"CITEREFSmith1997\"] = 1,\n [\"CITEREFSteil2008\"] = 1,\n [\"CITEREFSterbenz1974\"] = 2,\n [\"CITEREFWilkinson1963\"] = 1,\n [\"CITEREFWilkinson1965\"] = 1,\n [\"CITEREFWilkinson2003\"] = 1,\n [\"CITEREFZehendner2008\"] = 1,\n [\"Exception_handling\"] = 1,\n [\"Floating_point_exception\"] = 1,\n [\"Hidden_bit\"] = 1,\n [\"IEEE_754\"] = 1,\n [\"NB-Exponent\"] = 1,\n 
[\"NB-Significand\"] = 1,\n [\"Representable_numbers\"] = 1,\n [\"muller_et_al_pg_16\"] = 1,\n}\ntemplate_list = table#1 {\n [\"\"] = 1,\n [\"!\"] = 12,\n [\"10^\"] = 1,\n [\"=\"] = 3,\n [\"Anchor\"] = 7,\n [\"Block indent\"] = 4,\n [\"Citation\"] = 3,\n [\"Citation needed\"] = 3,\n [\"Cite arXiv\"] = 2,\n [\"Cite book\"] = 22,\n [\"Cite conference\"] = 4,\n [\"Cite journal\"] = 7,\n [\"Cite tech report\"] = 1,\n [\"Cite web\"] = 38,\n [\"Clarify\"] = 1,\n [\"Code\"] = 1,\n [\"Data types\"] = 1,\n [\"Div col\"] = 1,\n [\"Div col end\"] = 1,\n [\"E\"] = 2,\n [\"Floating-point\"] = 2,\n [\"Fontcolor\"] = 55,\n [\"Further\"] = 1,\n [\"ISBN\"] = 1,\n [\"Main\"] = 1,\n [\"Math\"] = 20,\n [\"Mvar\"] = 8,\n [\"Overline\"] = 1,\n [\"Redirect\"] = 1,\n [\"Reflist\"] = 2,\n [\"Rp\"] = 6,\n [\"See also\"] = 1,\n [\"Sfn\"] = 1,\n [\"Short description\"] = 1,\n [\"US patent reference\"] = 1,\n [\"Ulist\"] = 1,\n [\"Use dmy dates\"] = 1,\n [\"Val\"] = 21,\n [\"Var\"] = 37,\n [\"Webarchive\"] = 1,\n}\narticle_whitelist = table#1 {\n}\nciteref_patterns = table#1 {\n}\n"},"cachereport":{"origin":"mw-api-ext.eqiad.main-df448499c-qfz9n","timestamp":"20250318154823","ttl":2592000,"transientcontent":false}}});});</script> <script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Floating-point arithmetic","url":"https:\/\/en.wikipedia.org\/wiki\/Floating-point_arithmetic","sameAs":"http:\/\/www.wikidata.org\/entity\/Q117879","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q117879","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2001-11-11T18:56:25Z","dateModified":"2025-03-12T08:46:33Z","image":"https:\/\/upload.wikimedia.org\/wikipedia\/commons\/4\/4c\/Z3_Deutsches_Museum.JPG","headline":"computer format for representing 
rational numbers"}</script> </body> </html>