CINXE.COM
Vector processor - Wikipedia
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">
<head>
<meta charset="UTF-8">
<title>Vector processor - Wikipedia</title>
<script>
/*
 * Inline bootstrap script. It does two things:
 *   1. Replaces the root element's class list ("client-nojs ...") with a
 *      "client-js ..." list, applying any stored client preferences first.
 *   2. Assigns the page globals RLCONF, RLSTATE and RLPAGEMODULES.
 * String literals must stay on a single line: a raw line break inside a
 * quoted JavaScript string is a syntax error that aborts this whole script.
 */
(function () {
  // Same class list as on the html element, but starting with "client-js".
  var className = "client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available";
  // The preference cookie value is a list of class tokens separated by "%2C".
  var cookie = document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);
  if (cookie) {
    cookie[1].split('%2C').forEach(function (pref) {
      // Strip the "-clientpref-<value>" suffix and any character outside
      // [\w-] from the token to get the feature name, then swap the matching
      // "<feature>-clientpref-<value>" class for the stored token.
      className = className.replace(
        new RegExp('(^| )' + pref.replace(/-clientpref-\w+$|[^\w-]+/g, '') + '-clientpref-\\w+( |$)'),
        '$1' + pref + '$2'
      );
    });
  }
  document.documentElement.className = className;
}());

/* Page configuration values; the code that reads this global is not visible in this block. */
RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy",
"wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"14487e42-73ff-486c-9ce4-01ebd889d71f","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Vector_processor","wgTitle":"Vector processor","wgCurRevisionId":1255528928,"wgRevisionId":1255528928,"wgArticleId":58205,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description matches Wikidata","All articles with unsourced statements","Articles with unsourced statements from July 2023","Articles needing additional references from July 2023","All articles needing additional references","Articles containing potentially dated statements from 2016","All articles containing potentially dated statements","Articles with unsourced statements from June 2021",
"Wikipedia articles needing clarification from December 2023","Articles with specifically marked weasel-worded phrases from November 2021","Wikipedia articles with style issues from November 2021","All articles with style issues","Central processing unit","Coprocessors","Parallel computing","Vector supercomputers"],"wgPageViewLanguage":"en","wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgRelevantPageName":"Vector_processor","wgRelevantArticleId":58205,"wgIsProbablyEditable":true,"wgRelevantPageIsProbablyEditable":true,"wgRestrictionEdit":[],"wgRestrictionMove":[],"wgNoticeProject":"wikipedia","wgCiteReferencePreviewsActive":false,"wgFlaggedRevsParams":{"tags":{"status":{"levels":1}}},"wgMediaViewerOnClick":true,"wgMediaViewerEnabledByDefault":true,"wgPopupsFlags":0,"wgVisualEditor":{"pageLanguageCode":"en","pageLanguageDir":"ltr","pageVariantFallbacks":"en"},"wgMFDisplayWikibaseDescriptions":{"search":true,"watchlist":true,"tagline":false,"nearby":true},
"wgWMESchemaEditAttemptStepOversample":false,"wgWMEPageLength":60000,"wgRelatedArticlesCompat":[],"wgCentralAuthMobileDomain":false,"wgEditSubmitButtonLabelPublish":true,"wgULSPosition":"interlanguage","wgULSisCompactLinksEnabled":false,"wgVector2022LanguageInHeader":true,"wgULSisLanguageSelectorEmpty":false,"wgWikibaseItemId":"Q919509","wgCheckUserClientHintsHeadersJsApi":["brands","architecture","bitness","fullVersionList","mobile","model","platform","platformVersion"],"GEHomepageSuggestedEditsEnableTopics":true,"wgGETopicsMatchModeEnabled":false,"wgGEStructuredTaskRejectionReasonTextInputEnabled":false,"wgGELevelingUpEnabledForUser":false};

/* Map of module name to load state ("ready" or "loading"). */
RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.styles":"ready","ext.globalCssJs.user":"ready","user":"ready","user.options":"loading","ext.cite.styles":"ready","ext.pygments":"ready","ext.math.styles":"ready","skins.vector.search.codex.styles":"ready","skins.vector.styles":"ready","skins.vector.icons":"ready",
"jquery.makeCollapsible.styles":"ready","ext.wikimediamessages.styles":"ready","ext.visualEditor.desktopArticleTarget.noscript":"ready","ext.uls.interlanguage":"ready","wikibase.client.init":"ready","ext.wikimediaBadges":"ready"};

/* List of module names for this page; presumably loaded by the startup script requested later in the head (confirm). */
RLPAGEMODULES=["ext.cite.ux-enhancements","ext.pygments.view","mediawiki.page.media","site","mediawiki.page.ready","jquery.makeCollapsible","mediawiki.toc","skins.vector.js","ext.centralNotice.geoIP","ext.centralNotice.startUp","ext.gadget.ReferenceTooltips","ext.gadget.switcher","ext.urlShortener.toolbar","ext.centralauth.centralautologin","mmv.bootstrap","ext.popups","ext.visualEditor.desktopArticleTarget.init","ext.visualEditor.targetLoader","ext.echo.centralauth","ext.eventLogging","ext.wikimediaEvents","ext.navigationTiming","ext.uls.interface","ext.cx.eventlogging.campaigns","ext.cx.uls.quick.actions","wikibase.client.vector-2022","ext.checkUser.clientHints","ext.growthExperiments.SuggestedEditSession","wikibase.sidebar.tracking"];
</script>
<script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return["user.options@12s5i",function($,jQuery,require,module){mw.user.tokens.set({"patrolToken":"+\\","watchToken":"+\\","csrfToken":"+\\"}); }];});});</script> <link rel="stylesheet" href="/w/load.php?lang=en&modules=ext.cite.styles%7Cext.math.styles%7Cext.pygments%2CwikimediaBadges%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediamessages.styles%7Cjquery.makeCollapsible.styles%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles%7Cwikibase.client.init&only=styles&skin=vector-2022"> <script async="" src="/w/load.php?lang=en&modules=startup&only=scripts&raw=1&skin=vector-2022"></script> <meta name="ResourceLoaderDynamicStyles" content=""> <link rel="stylesheet" href="/w/load.php?lang=en&modules=site.styles&only=styles&skin=vector-2022"> <meta name="generator" content="MediaWiki 1.44.0-wmf.4"> <meta name="referrer" content="origin"> <meta name="referrer" content="origin-when-cross-origin"> <meta name="robots" content="max-image-preview:standard"> <meta name="format-detection" content="telephone=no"> <meta name="viewport" content="width=1120"> <meta property="og:title" content="Vector processor - Wikipedia"> <meta property="og:type" content="website"> <link rel="preconnect" href="//upload.wikimedia.org"> <link rel="alternate" media="only screen and (max-width: 640px)" href="//en.m.wikipedia.org/wiki/Vector_processor"> <link rel="alternate" type="application/x-wiki" title="Edit this page" href="/w/index.php?title=Vector_processor&action=edit"> <link rel="apple-touch-icon" href="/static/apple-touch/wikipedia.png"> <link rel="icon" href="/static/favicon/wikipedia.ico"> <link rel="search" type="application/opensearchdescription+xml" href="/w/rest.php/v1/search" title="Wikipedia (en)"> <link rel="EditURI" type="application/rsd+xml" href="//en.wikipedia.org/w/api.php?action=rsd"> <link rel="canonical" 
href="https://en.wikipedia.org/wiki/Vector_processor"> <link rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/deed.en"> <link rel="alternate" type="application/atom+xml" title="Wikipedia Atom feed" href="/w/index.php?title=Special:RecentChanges&feed=atom"> <link rel="dns-prefetch" href="//meta.wikimedia.org" /> <link rel="dns-prefetch" href="//login.wikimedia.org"> </head> <body class="skin--responsive skin-vector skin-vector-search-vue mediawiki ltr sitedir-ltr mw-hide-empty-elt ns-0 ns-subject mw-editable page-Vector_processor rootpage-Vector_processor skin-vector-2022 action-view"><a class="mw-jump-link" href="#bodyContent">Jump to content</a> <div class="vector-header-container"> <header class="vector-header mw-header"> <div class="vector-header-start"> <nav class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-dropdown" class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" > <input type="checkbox" id="vector-main-menu-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-main-menu-dropdown" class="vector-dropdown-checkbox " aria-label="Main menu" > <label id="vector-main-menu-dropdown-label" for="vector-main-menu-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span> <span class="vector-dropdown-label-text">Main menu</span> </label> <div class="vector-dropdown-content"> <div id="vector-main-menu-unpinned-container" class="vector-unpinned-container"> <div id="vector-main-menu" class="vector-main-menu vector-pinnable-element"> <div class="vector-pinnable-header vector-main-menu-pinnable-header vector-pinnable-header-unpinned" data-feature-name="main-menu-pinned" data-pinnable-element-id="vector-main-menu" 
data-pinned-container-id="vector-main-menu-pinned-container" data-unpinned-container-id="vector-main-menu-unpinned-container" > <div class="vector-pinnable-header-label">Main menu</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-main-menu.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-main-menu.unpin">hide</button> </div> <div id="p-navigation" class="vector-menu mw-portlet mw-portlet-navigation" > <div class="vector-menu-heading"> Navigation </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-mainpage-description" class="mw-list-item"><a href="/wiki/Main_Page" title="Visit the main page [z]" accesskey="z"><span>Main page</span></a></li><li id="n-contents" class="mw-list-item"><a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a></li><li id="n-currentevents" class="mw-list-item"><a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a></li><li id="n-randompage" class="mw-list-item"><a href="/wiki/Special:Random" title="Visit a randomly selected article [x]" accesskey="x"><span>Random article</span></a></li><li id="n-aboutsite" class="mw-list-item"><a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a></li><li id="n-contactpage" class="mw-list-item"><a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a></li> </ul> </div> </div> <div id="p-interaction" class="vector-menu mw-portlet mw-portlet-interaction" > <div class="vector-menu-heading"> Contribute </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="n-help" class="mw-list-item"><a href="/wiki/Help:Contents" title="Guidance on how to use 
and edit Wikipedia"><span>Help</span></a></li><li id="n-introduction" class="mw-list-item"><a href="/wiki/Help:Introduction" title="Learn how to edit Wikipedia"><span>Learn to edit</span></a></li><li id="n-portal" class="mw-list-item"><a href="/wiki/Wikipedia:Community_portal" title="The hub for editors"><span>Community portal</span></a></li><li id="n-recentchanges" class="mw-list-item"><a href="/wiki/Special:RecentChanges" title="A list of recent changes to Wikipedia [r]" accesskey="r"><span>Recent changes</span></a></li><li id="n-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_upload_wizard" title="Add images or other media for use on Wikipedia"><span>Upload file</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> <a href="/wiki/Main_Page" class="mw-logo"> <img class="mw-logo-icon" src="/static/images/icons/wikipedia.png" alt="" aria-hidden="true" height="50" width="50"> <span class="mw-logo-container skin-invert"> <img class="mw-logo-wordmark" alt="Wikipedia" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"> <img class="mw-logo-tagline" alt="The Free Encyclopedia" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" width="117" height="13" style="width: 7.3125em; height: 0.8125em;"> </span> </a> </div> <div class="vector-header-end"> <div id="p-search" role="search" class="vector-search-box-vue vector-search-box-collapses vector-search-box-show-thumbnail vector-search-box-auto-expand-width vector-search-box"> <a href="/wiki/Special:Search" class="cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only search-toggle" title="Search Wikipedia [f]" accesskey="f"><span class="vector-icon mw-ui-icon-search mw-ui-icon-wikimedia-search"></span> <span>Search</span> </a> <div class="vector-typeahead-search-container"> <div class="cdx-typeahead-search cdx-typeahead-search--show-thumbnail 
cdx-typeahead-search--auto-expand-width"> <form action="/w/index.php" id="searchform" class="cdx-search-input cdx-search-input--has-end-button"> <div id="simpleSearch" class="cdx-search-input__input-wrapper" data-search-loc="header-moved"> <div class="cdx-text-input cdx-text-input--has-start-icon"> <input class="cdx-text-input__input" type="search" name="search" placeholder="Search Wikipedia" aria-label="Search Wikipedia" autocapitalize="sentences" title="Search Wikipedia [f]" accesskey="f" id="searchInput" > <span class="cdx-text-input__icon cdx-text-input__start-icon"></span> </div> <input type="hidden" name="title" value="Special:Search"> </div> <button class="cdx-button cdx-search-input__end-button">Search</button> </form> </div> </div> </div> <nav class="vector-user-links vector-user-links-wide" aria-label="Personal tools"> <div class="vector-user-links-main"> <div id="p-vector-user-menu-preferences" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-userpage" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-dropdown" class="vector-dropdown " title="Change the appearance of the page's font size, width, and color" > <input type="checkbox" id="vector-appearance-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-appearance-dropdown" class="vector-dropdown-checkbox " aria-label="Appearance" > <label id="vector-appearance-dropdown-label" for="vector-appearance-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-appearance mw-ui-icon-wikimedia-appearance"></span> <span 
class="vector-dropdown-label-text">Appearance</span> </label> <div class="vector-dropdown-content"> <div id="vector-appearance-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <div id="p-vector-user-menu-notifications" class="vector-menu mw-portlet emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> <div id="p-vector-user-menu-overflow" class="vector-menu mw-portlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en" class=""><span>Donate</span></a> </li> <li id="pt-createaccount-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:CreateAccount&returnto=Vector+processor" title="You are encouraged to create an account and log in; however, it is not mandatory" class=""><span>Create account</span></a> </li> <li id="pt-login-2" class="user-links-collapsible-item mw-list-item user-links-collapsible-item"><a data-mw="interface" href="/w/index.php?title=Special:UserLogin&returnto=Vector+processor" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o" class=""><span>Log in</span></a> </li> </ul> </div> </div> </div> <div id="vector-user-links-dropdown" class="vector-dropdown vector-user-menu vector-button-flush-right vector-user-menu-logged-out" title="Log in and more options" > <input type="checkbox" id="vector-user-links-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-user-links-dropdown" class="vector-dropdown-checkbox " aria-label="Personal tools" > <label id="vector-user-links-dropdown-label" for="vector-user-links-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></span> <span class="vector-dropdown-label-text">Personal tools</span> </label> <div class="vector-dropdown-content"> <div id="p-personal" class="vector-menu mw-portlet mw-portlet-personal user-links-collapsible-item" title="User menu" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-sitesupport" class="user-links-collapsible-item mw-list-item"><a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en"><span>Donate</span></a></li><li id="pt-createaccount" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:CreateAccount&returnto=Vector+processor" title="You are encouraged to create an account and log in; however, it is not mandatory"><span class="vector-icon mw-ui-icon-userAdd mw-ui-icon-wikimedia-userAdd"></span> <span>Create account</span></a></li><li id="pt-login" class="user-links-collapsible-item mw-list-item"><a href="/w/index.php?title=Special:UserLogin&returnto=Vector+processor" title="You're encouraged to log in; however, it's not mandatory. 
[o]" accesskey="o"><span class="vector-icon mw-ui-icon-logIn mw-ui-icon-wikimedia-logIn"></span> <span>Log in</span></a></li> </ul> </div> </div> <div id="p-user-menu-anon-editor" class="vector-menu mw-portlet mw-portlet-user-menu-anon-editor" > <div class="vector-menu-heading"> Pages for logged out editors <a href="/wiki/Help:Introduction" aria-label="Learn more about editing"><span>learn more</span></a> </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="pt-anoncontribs" class="mw-list-item"><a href="/wiki/Special:MyContributions" title="A list of edits made from this IP address [y]" accesskey="y"><span>Contributions</span></a></li><li id="pt-anontalk" class="mw-list-item"><a href="/wiki/Special:MyTalk" title="Discussion about edits from this IP address [n]" accesskey="n"><span>Talk</span></a></li> </ul> </div> </div> </div> </div> </nav> </div> </header> </div> <div class="mw-page-container"> <div class="mw-page-container-inner"> <div class="vector-sitenotice-container"> <div id="siteNotice"><!-- CentralNotice --></div> </div> <div class="vector-column-start"> <div class="vector-main-menu-container"> <div id="mw-navigation"> <nav id="mw-panel" class="vector-main-menu-landmark" aria-label="Site"> <div id="vector-main-menu-pinned-container" class="vector-pinned-container"> </div> </nav> </div> </div> <div class="vector-sticky-pinned-container"> <nav id="mw-panel-toc" aria-label="Contents" data-event-name="ui.sidebar-toc" class="mw-table-of-contents-container vector-toc-landmark"> <div id="vector-toc-pinned-container" class="vector-pinned-container"> <div id="vector-toc" class="vector-toc vector-pinnable-element"> <div class="vector-pinnable-header vector-toc-pinnable-header vector-pinnable-header-pinned" data-feature-name="toc-pinned" data-pinnable-element-id="vector-toc" > <h2 class="vector-pinnable-header-label">Contents</h2> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" 
data-event-name="pinnable-header.vector-toc.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-toc.unpin">hide</button> </div> <ul class="vector-toc-contents" id="mw-panel-toc-list"> <li id="toc-mw-content-text" class="vector-toc-list-item vector-toc-level-1"> <a href="#" class="vector-toc-link"> <div class="vector-toc-text">(Top)</div> </a> </li> <li id="toc-History" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#History"> <div class="vector-toc-text"> <span class="vector-toc-numb">1</span> <span>History</span> </div> </a> <button aria-controls="toc-History-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle History subsection</span> </button> <ul id="toc-History-sublist" class="vector-toc-list"> <li id="toc-Early_research_and_development" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Early_research_and_development"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.1</span> <span>Early research and development</span> </div> </a> <ul id="toc-Early_research_and_development-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Computer_for_operations_with_functions" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Computer_for_operations_with_functions"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.2</span> <span>Computer for operations with functions</span> </div> </a> <ul id="toc-Computer_for_operations_with_functions-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Supercomputers" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Supercomputers"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.3</span> 
<span>Supercomputers</span> </div> </a> <ul id="toc-Supercomputers-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-GPU" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#GPU"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.4</span> <span>GPU</span> </div> </a> <ul id="toc-GPU-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Recent_development" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Recent_development"> <div class="vector-toc-text"> <span class="vector-toc-numb">1.5</span> <span>Recent development</span> </div> </a> <ul id="toc-Recent_development-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Comparison_with_modern_architectures" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Comparison_with_modern_architectures"> <div class="vector-toc-text"> <span class="vector-toc-numb">2</span> <span>Comparison with modern architectures</span> </div> </a> <button aria-controls="toc-Comparison_with_modern_architectures-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Comparison with modern architectures subsection</span> </button> <ul id="toc-Comparison_with_modern_architectures-sublist" class="vector-toc-list"> <li id="toc-Difference_between_SIMD_and_vector_processors" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Difference_between_SIMD_and_vector_processors"> <div class="vector-toc-text"> <span class="vector-toc-numb">2.1</span> <span>Difference between SIMD and vector processors</span> </div> </a> <ul id="toc-Difference_between_SIMD_and_vector_processors-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Description" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a 
class="vector-toc-link" href="#Description"> <div class="vector-toc-text"> <span class="vector-toc-numb">3</span> <span>Description</span> </div> </a> <button aria-controls="toc-Description-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Description subsection</span> </button> <ul id="toc-Description-sublist" class="vector-toc-list"> <li id="toc-Vector_instructions" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Vector_instructions"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.1</span> <span>Vector instructions</span> </div> </a> <ul id="toc-Vector_instructions-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Vector_instruction_example" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Vector_instruction_example"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.2</span> <span>Vector instruction example</span> </div> </a> <ul id="toc-Vector_instruction_example-sublist" class="vector-toc-list"> <li id="toc-Scalar_assembler" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Scalar_assembler"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.2.1</span> <span>Scalar assembler</span> </div> </a> <ul id="toc-Scalar_assembler-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Pure_(non-predicated,_packed)_SIMD" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Pure_(non-predicated,_packed)_SIMD"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.2.2</span> <span>Pure (non-predicated, packed) SIMD</span> </div> </a> <ul id="toc-Pure_(non-predicated,_packed)_SIMD-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Predicated_SIMD" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Predicated_SIMD"> <div 
class="vector-toc-text"> <span class="vector-toc-numb">3.2.3</span> <span>Predicated SIMD</span> </div> </a> <ul id="toc-Predicated_SIMD-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Pure_(true)_vector_ISA" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Pure_(true)_vector_ISA"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.2.4</span> <span>Pure (true) vector ISA</span> </div> </a> <ul id="toc-Pure_(true)_vector_ISA-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Vector_reduction_example" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Vector_reduction_example"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.3</span> <span>Vector reduction example</span> </div> </a> <ul id="toc-Vector_reduction_example-sublist" class="vector-toc-list"> <li id="toc-Scalar_assembler_2" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Scalar_assembler_2"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.3.1</span> <span>Scalar assembler</span> </div> </a> <ul id="toc-Scalar_assembler_2-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-SIMD_reduction" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#SIMD_reduction"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.3.2</span> <span>SIMD reduction</span> </div> </a> <ul id="toc-SIMD_reduction-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Vector_ISA_reduction" class="vector-toc-list-item vector-toc-level-3"> <a class="vector-toc-link" href="#Vector_ISA_reduction"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.3.3</span> <span>Vector ISA reduction</span> </div> </a> <ul id="toc-Vector_ISA_reduction-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Insights_from_examples" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" 
href="#Insights_from_examples"> <div class="vector-toc-text"> <span class="vector-toc-numb">3.4</span> <span>Insights from examples</span> </div> </a> <ul id="toc-Insights_from_examples-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Vector_processor_features" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Vector_processor_features"> <div class="vector-toc-text"> <span class="vector-toc-numb">4</span> <span>Vector processor features</span> </div> </a> <button aria-controls="toc-Vector_processor_features-sublist" class="cdx-button cdx-button--weight-quiet cdx-button--icon-only vector-toc-toggle"> <span class="vector-icon mw-ui-icon-wikimedia-expand"></span> <span>Toggle Vector processor features subsection</span> </button> <ul id="toc-Vector_processor_features-sublist" class="vector-toc-list"> <li id="toc-GPU_vector_processing_features" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#GPU_vector_processing_features"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.1</span> <span>GPU vector processing features</span> </div> </a> <ul id="toc-GPU_vector_processing_features-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-Fault_(or_Fail)_First" class="vector-toc-list-item vector-toc-level-2"> <a class="vector-toc-link" href="#Fault_(or_Fail)_First"> <div class="vector-toc-text"> <span class="vector-toc-numb">4.2</span> <span>Fault (or Fail) First</span> </div> </a> <ul id="toc-Fault_(or_Fail)_First-sublist" class="vector-toc-list"> </ul> </li> </ul> </li> <li id="toc-Performance_and_speed_up" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#Performance_and_speed_up"> <div class="vector-toc-text"> <span class="vector-toc-numb">5</span> <span>Performance and speed up</span> </div> </a> <ul id="toc-Performance_and_speed_up-sublist" class="vector-toc-list"> </ul> </li> 
<li id="toc-See_also" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#See_also"> <div class="vector-toc-text"> <span class="vector-toc-numb">6</span> <span>See also</span> </div> </a> <ul id="toc-See_also-sublist" class="vector-toc-list"> </ul> </li> <li id="toc-References" class="vector-toc-list-item vector-toc-level-1 vector-toc-list-item-expanded"> <a class="vector-toc-link" href="#References"> <div class="vector-toc-text"> <span class="vector-toc-numb">7</span> <span>References</span> </div> </a> <ul id="toc-References-sublist" class="vector-toc-list"> </ul> </li> </ul> </div> </div> </nav> </div> </div> <div class="mw-content-container"> <main id="content" class="mw-body"> <header class="mw-body-header vector-page-titlebar"> <nav aria-label="Contents" class="vector-toc-landmark"> <div id="vector-page-titlebar-toc" class="vector-dropdown vector-page-titlebar-toc vector-button-flush-left" > <input type="checkbox" id="vector-page-titlebar-toc-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-titlebar-toc" class="vector-dropdown-checkbox " aria-label="Toggle the table of contents" > <label id="vector-page-titlebar-toc-label" for="vector-page-titlebar-toc-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only " aria-hidden="true" ><span class="vector-icon mw-ui-icon-listBullet mw-ui-icon-wikimedia-listBullet"></span> <span class="vector-dropdown-label-text">Toggle the table of contents</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-titlebar-toc-unpinned-container" class="vector-unpinned-container"> </div> </div> </div> </nav> <h1 id="firstHeading" class="firstHeading mw-first-heading"><span class="mw-page-title-main">Vector processor</span></h1> <div id="p-lang-btn" class="vector-dropdown mw-portlet mw-portlet-lang" > <input type="checkbox" 
id="p-lang-btn-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-p-lang-btn" class="vector-dropdown-checkbox mw-interlanguage-selector" aria-label="Go to an article in another language. Available in 25 languages" > <label id="p-lang-btn-label" for="p-lang-btn-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--action-progressive mw-portlet-lang-heading-25" aria-hidden="true" ><span class="vector-icon mw-ui-icon-language-progressive mw-ui-icon-wikimedia-language-progressive"></span> <span class="vector-dropdown-label-text">25 languages</span> </label> <div class="vector-dropdown-content"> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li class="interlanguage-link interwiki-ar mw-list-item"><a href="https://ar.wikipedia.org/wiki/%D9%85%D8%B9%D8%A7%D9%84%D8%AC_%D9%85%D8%AA%D8%AC%D9%87%D9%8A" title="معالج متجهي – Arabic" lang="ar" hreflang="ar" data-title="معالج متجهي" data-language-autonym="العربية" data-language-local-name="Arabic" class="interlanguage-link-target"><span>العربية</span></a></li><li class="interlanguage-link interwiki-az mw-list-item"><a href="https://az.wikipedia.org/wiki/Vektor_prosessoru" title="Vektor prosessoru – Azerbaijani" lang="az" hreflang="az" data-title="Vektor prosessoru" data-language-autonym="Azərbaycanca" data-language-local-name="Azerbaijani" class="interlanguage-link-target"><span>Azərbaycanca</span></a></li><li class="interlanguage-link interwiki-ca mw-list-item"><a href="https://ca.wikipedia.org/wiki/Processador_vectorial" title="Processador vectorial – Catalan" lang="ca" hreflang="ca" data-title="Processador vectorial" data-language-autonym="Català" data-language-local-name="Catalan" class="interlanguage-link-target"><span>Català</span></a></li><li class="interlanguage-link interwiki-cs mw-list-item"><a href="https://cs.wikipedia.org/wiki/Vektorov%C3%BD_procesor" title="Vektorový procesor – 
Czech" lang="cs" hreflang="cs" data-title="Vektorový procesor" data-language-autonym="Čeština" data-language-local-name="Czech" class="interlanguage-link-target"><span>Čeština</span></a></li><li class="interlanguage-link interwiki-de mw-list-item"><a href="https://de.wikipedia.org/wiki/Vektorprozessor" title="Vektorprozessor – German" lang="de" hreflang="de" data-title="Vektorprozessor" data-language-autonym="Deutsch" data-language-local-name="German" class="interlanguage-link-target"><span>Deutsch</span></a></li><li class="interlanguage-link interwiki-el mw-list-item"><a href="https://el.wikipedia.org/wiki/%CE%94%CE%B9%CE%B1%CE%BD%CF%85%CF%83%CE%BC%CE%B1%CF%84%CE%B9%CE%BA%CF%8C%CF%82_%CE%B5%CF%80%CE%B5%CE%BE%CE%B5%CF%81%CE%B3%CE%B1%CF%83%CF%84%CE%AE%CF%82" title="Διανυσματικός επεξεργαστής – Greek" lang="el" hreflang="el" data-title="Διανυσματικός επεξεργαστής" data-language-autonym="Ελληνικά" data-language-local-name="Greek" class="interlanguage-link-target"><span>Ελληνικά</span></a></li><li class="interlanguage-link interwiki-es mw-list-item"><a href="https://es.wikipedia.org/wiki/Procesador_vectorial" title="Procesador vectorial – Spanish" lang="es" hreflang="es" data-title="Procesador vectorial" data-language-autonym="Español" data-language-local-name="Spanish" class="interlanguage-link-target"><span>Español</span></a></li><li class="interlanguage-link interwiki-fa mw-list-item"><a href="https://fa.wikipedia.org/wiki/%D9%BE%D8%B1%D8%AF%D8%A7%D8%B2%D9%86%D8%AF%D9%87_%D8%A8%D8%B1%D8%AF%D8%A7%D8%B1%DB%8C" title="پردازنده برداری – Persian" lang="fa" hreflang="fa" data-title="پردازنده برداری" data-language-autonym="فارسی" data-language-local-name="Persian" class="interlanguage-link-target"><span>فارسی</span></a></li><li class="interlanguage-link interwiki-fr mw-list-item"><a href="https://fr.wikipedia.org/wiki/Processeur_vectoriel" title="Processeur vectoriel – French" lang="fr" hreflang="fr" data-title="Processeur vectoriel" data-language-autonym="Français" 
data-language-local-name="French" class="interlanguage-link-target"><span>Français</span></a></li><li class="interlanguage-link interwiki-ga mw-list-item"><a href="https://ga.wikipedia.org/wiki/Pr%C3%B3ise%C3%A1la%C3%AD_eagar" title="Próiseálaí eagar – Irish" lang="ga" hreflang="ga" data-title="Próiseálaí eagar" data-language-autonym="Gaeilge" data-language-local-name="Irish" class="interlanguage-link-target"><span>Gaeilge</span></a></li><li class="interlanguage-link interwiki-ko mw-list-item"><a href="https://ko.wikipedia.org/wiki/%EB%B2%A1%ED%84%B0_%ED%94%84%EB%A1%9C%EC%84%B8%EC%84%9C" title="벡터 프로세서 – Korean" lang="ko" hreflang="ko" data-title="벡터 프로세서" data-language-autonym="한국어" data-language-local-name="Korean" class="interlanguage-link-target"><span>한국어</span></a></li><li class="interlanguage-link interwiki-id mw-list-item"><a href="https://id.wikipedia.org/wiki/Prosesor_vektor" title="Prosesor vektor – Indonesian" lang="id" hreflang="id" data-title="Prosesor vektor" data-language-autonym="Bahasa Indonesia" data-language-local-name="Indonesian" class="interlanguage-link-target"><span>Bahasa Indonesia</span></a></li><li class="interlanguage-link interwiki-it mw-list-item"><a href="https://it.wikipedia.org/wiki/Processore_vettoriale" title="Processore vettoriale – Italian" lang="it" hreflang="it" data-title="Processore vettoriale" data-language-autonym="Italiano" data-language-local-name="Italian" class="interlanguage-link-target"><span>Italiano</span></a></li><li class="interlanguage-link interwiki-hu mw-list-item"><a href="https://hu.wikipedia.org/wiki/Vektorprocesszor" title="Vektorprocesszor – Hungarian" lang="hu" hreflang="hu" data-title="Vektorprocesszor" data-language-autonym="Magyar" data-language-local-name="Hungarian" class="interlanguage-link-target"><span>Magyar</span></a></li><li class="interlanguage-link interwiki-nl mw-list-item"><a href="https://nl.wikipedia.org/wiki/Array_processor" title="Array processor – Dutch" lang="nl" hreflang="nl" 
data-title="Array processor" data-language-autonym="Nederlands" data-language-local-name="Dutch" class="interlanguage-link-target"><span>Nederlands</span></a></li><li class="interlanguage-link interwiki-ja mw-list-item"><a href="https://ja.wikipedia.org/wiki/%E3%83%99%E3%82%AF%E3%83%88%E3%83%AB%E8%A8%88%E7%AE%97%E6%A9%9F" title="ベクトル計算機 – Japanese" lang="ja" hreflang="ja" data-title="ベクトル計算機" data-language-autonym="日本語" data-language-local-name="Japanese" class="interlanguage-link-target"><span>日本語</span></a></li><li class="interlanguage-link interwiki-no mw-list-item"><a href="https://no.wikipedia.org/wiki/Vektorprosessor" title="Vektorprosessor – Norwegian Bokmål" lang="nb" hreflang="nb" data-title="Vektorprosessor" data-language-autonym="Norsk bokmål" data-language-local-name="Norwegian Bokmål" class="interlanguage-link-target"><span>Norsk bokmål</span></a></li><li class="interlanguage-link interwiki-pl mw-list-item"><a href="https://pl.wikipedia.org/wiki/Procesor_wektorowy" title="Procesor wektorowy – Polish" lang="pl" hreflang="pl" data-title="Procesor wektorowy" data-language-autonym="Polski" data-language-local-name="Polish" class="interlanguage-link-target"><span>Polski</span></a></li><li class="interlanguage-link interwiki-pt mw-list-item"><a href="https://pt.wikipedia.org/wiki/Processador_vetorial" title="Processador vetorial – Portuguese" lang="pt" hreflang="pt" data-title="Processador vetorial" data-language-autonym="Português" data-language-local-name="Portuguese" class="interlanguage-link-target"><span>Português</span></a></li><li class="interlanguage-link interwiki-ro mw-list-item"><a href="https://ro.wikipedia.org/wiki/Procesor_vectorial" title="Procesor vectorial – Romanian" lang="ro" hreflang="ro" data-title="Procesor vectorial" data-language-autonym="Română" data-language-local-name="Romanian" class="interlanguage-link-target"><span>Română</span></a></li><li class="interlanguage-link interwiki-ru mw-list-item"><a 
href="https://ru.wikipedia.org/wiki/%D0%92%D0%B5%D0%BA%D1%82%D0%BE%D1%80%D0%BD%D1%8B%D0%B9_%D0%BF%D1%80%D0%BE%D1%86%D0%B5%D1%81%D1%81%D0%BE%D1%80" title="Векторный процессор – Russian" lang="ru" hreflang="ru" data-title="Векторный процессор" data-language-autonym="Русский" data-language-local-name="Russian" class="interlanguage-link-target"><span>Русский</span></a></li><li class="interlanguage-link interwiki-sk mw-list-item"><a href="https://sk.wikipedia.org/wiki/Vektorov%C3%BD_procesor" title="Vektorový procesor – Slovak" lang="sk" hreflang="sk" data-title="Vektorový procesor" data-language-autonym="Slovenčina" data-language-local-name="Slovak" class="interlanguage-link-target"><span>Slovenčina</span></a></li><li class="interlanguage-link interwiki-fi mw-list-item"><a href="https://fi.wikipedia.org/wiki/Vektorisuoritin" title="Vektorisuoritin – Finnish" lang="fi" hreflang="fi" data-title="Vektorisuoritin" data-language-autonym="Suomi" data-language-local-name="Finnish" class="interlanguage-link-target"><span>Suomi</span></a></li><li class="interlanguage-link interwiki-uk mw-list-item"><a href="https://uk.wikipedia.org/wiki/%D0%92%D0%B5%D0%BA%D1%82%D0%BE%D1%80%D0%BD%D0%B8%D0%B9_%D0%BF%D1%80%D0%BE%D1%86%D0%B5%D1%81%D0%BE%D1%80" title="Векторний процесор – Ukrainian" lang="uk" hreflang="uk" data-title="Векторний процесор" data-language-autonym="Українська" data-language-local-name="Ukrainian" class="interlanguage-link-target"><span>Українська</span></a></li><li class="interlanguage-link interwiki-zh mw-list-item"><a href="https://zh.wikipedia.org/wiki/%E5%90%91%E9%87%8F%E5%A4%84%E7%90%86%E5%99%A8" title="向量处理器 – Chinese" lang="zh" hreflang="zh" data-title="向量处理器" data-language-autonym="中文" data-language-local-name="Chinese" class="interlanguage-link-target"><span>中文</span></a></li> </ul> <div class="after-portlet after-portlet-lang"><span class="wb-langlinks-edit wb-langlinks-link"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q919509#sitelinks-wikipedia" 
title="Edit interlanguage links" class="wbc-editpage">Edit links</a></span></div> </div> </div> </div> </header> <div class="vector-page-toolbar"> <div class="vector-page-toolbar-container"> <div id="left-navigation"> <nav aria-label="Namespaces"> <div id="p-associated-pages" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-associated-pages" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-nstab-main" class="selected vector-tab-noicon mw-list-item"><a href="/wiki/Vector_processor" title="View the content page [c]" accesskey="c"><span>Article</span></a></li><li id="ca-talk" class="vector-tab-noicon mw-list-item"><a href="/wiki/Talk:Vector_processor" rel="discussion" title="Discuss improvements to the content page [t]" accesskey="t"><span>Talk</span></a></li> </ul> </div> </div> <div id="vector-variants-dropdown" class="vector-dropdown emptyPortlet" > <input type="checkbox" id="vector-variants-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-variants-dropdown" class="vector-dropdown-checkbox " aria-label="Change language variant" > <label id="vector-variants-dropdown-label" for="vector-variants-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">English</span> </label> <div class="vector-dropdown-content"> <div id="p-variants" class="vector-menu mw-portlet mw-portlet-variants emptyPortlet" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> </ul> </div> </div> </div> </div> </nav> </div> <div id="right-navigation" class="vector-collapsible"> <nav aria-label="Views"> <div id="p-views" class="vector-menu vector-menu-tabs mw-portlet mw-portlet-views" > <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-view" class="selected vector-tab-noicon mw-list-item"><a 
href="/wiki/Vector_processor"><span>Read</span></a></li><li id="ca-edit" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Vector_processor&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-history" class="vector-tab-noicon mw-list-item"><a href="/w/index.php?title=Vector_processor&action=history" title="Past revisions of this page [h]" accesskey="h"><span>View history</span></a></li> </ul> </div> </div> </nav> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-dropdown" class="vector-dropdown vector-page-tools-dropdown" > <input type="checkbox" id="vector-page-tools-dropdown-checkbox" role="button" aria-haspopup="true" data-event-name="ui.dropdown-vector-page-tools-dropdown" class="vector-dropdown-checkbox " aria-label="Tools" > <label id="vector-page-tools-dropdown-label" for="vector-page-tools-dropdown-checkbox" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet" aria-hidden="true" ><span class="vector-dropdown-label-text">Tools</span> </label> <div class="vector-dropdown-content"> <div id="vector-page-tools-unpinned-container" class="vector-unpinned-container"> <div id="vector-page-tools" class="vector-page-tools vector-pinnable-element"> <div class="vector-pinnable-header vector-page-tools-pinnable-header vector-pinnable-header-unpinned" data-feature-name="page-tools-pinned" data-pinnable-element-id="vector-page-tools" data-pinned-container-id="vector-page-tools-pinned-container" data-unpinned-container-id="vector-page-tools-unpinned-container" > <div class="vector-pinnable-header-label">Tools</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-page-tools.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" 
data-event-name="pinnable-header.vector-page-tools.unpin">hide</button> </div> <div id="p-cactions" class="vector-menu mw-portlet mw-portlet-cactions emptyPortlet vector-has-collapsible-items" title="More options" > <div class="vector-menu-heading"> Actions </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="ca-more-view" class="selected vector-more-collapsible-item mw-list-item"><a href="/wiki/Vector_processor"><span>Read</span></a></li><li id="ca-more-edit" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Vector_processor&action=edit" title="Edit this page [e]" accesskey="e"><span>Edit</span></a></li><li id="ca-more-history" class="vector-more-collapsible-item mw-list-item"><a href="/w/index.php?title=Vector_processor&action=history"><span>View history</span></a></li> </ul> </div> </div> <div id="p-tb" class="vector-menu mw-portlet mw-portlet-tb" > <div class="vector-menu-heading"> General </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-whatlinkshere" class="mw-list-item"><a href="/wiki/Special:WhatLinksHere/Vector_processor" title="List of all English Wikipedia pages containing links to this page [j]" accesskey="j"><span>What links here</span></a></li><li id="t-recentchangeslinked" class="mw-list-item"><a href="/wiki/Special:RecentChangesLinked/Vector_processor" rel="nofollow" title="Recent changes in pages linked from this page [k]" accesskey="k"><span>Related changes</span></a></li><li id="t-upload" class="mw-list-item"><a href="/wiki/Wikipedia:File_Upload_Wizard" title="Upload files [u]" accesskey="u"><span>Upload file</span></a></li><li id="t-specialpages" class="mw-list-item"><a href="/wiki/Special:SpecialPages" title="A list of all special pages [q]" accesskey="q"><span>Special pages</span></a></li><li id="t-permalink" class="mw-list-item"><a href="/w/index.php?title=Vector_processor&oldid=1255528928" title="Permanent link to this revision of this 
page"><span>Permanent link</span></a></li><li id="t-info" class="mw-list-item"><a href="/w/index.php?title=Vector_processor&action=info" title="More information about this page"><span>Page information</span></a></li><li id="t-cite" class="mw-list-item"><a href="/w/index.php?title=Special:CiteThisPage&page=Vector_processor&id=1255528928&wpFormIdentifier=titleform" title="Information on how to cite this page"><span>Cite this page</span></a></li><li id="t-urlshortener" class="mw-list-item"><a href="/w/index.php?title=Special:UrlShortener&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FVector_processor"><span>Get shortened URL</span></a></li><li id="t-urlshortener-qrcode" class="mw-list-item"><a href="/w/index.php?title=Special:QrCode&url=https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FVector_processor"><span>Download QR code</span></a></li> </ul> </div> </div> <div id="p-coll-print_export" class="vector-menu mw-portlet mw-portlet-coll-print_export" > <div class="vector-menu-heading"> Print/export </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="coll-download-as-rl" class="mw-list-item"><a href="/w/index.php?title=Special:DownloadAsPdf&page=Vector_processor&action=show-download-screen" title="Download this page as a PDF file"><span>Download as PDF</span></a></li><li id="t-print" class="mw-list-item"><a href="/w/index.php?title=Vector_processor&printable=yes" title="Printable version of this page [p]" accesskey="p"><span>Printable version</span></a></li> </ul> </div> </div> <div id="p-wikibase-otherprojects" class="vector-menu mw-portlet mw-portlet-wikibase-otherprojects" > <div class="vector-menu-heading"> In other projects </div> <div class="vector-menu-content"> <ul class="vector-menu-content-list"> <li id="t-wikibase" class="wb-otherproject-link wb-otherproject-wikibase-dataitem mw-list-item"><a href="https://www.wikidata.org/wiki/Special:EntityPage/Q919509" title="Structured data on this page hosted by Wikidata [g]" 
accesskey="g"><span>Wikidata item</span></a></li> </ul> </div> </div> </div> </div> </div> </div> </nav> </div> </div> </div> <div class="vector-column-end"> <div class="vector-sticky-pinned-container"> <nav class="vector-page-tools-landmark" aria-label="Page tools"> <div id="vector-page-tools-pinned-container" class="vector-pinned-container"> </div> </nav> <nav class="vector-appearance-landmark" aria-label="Appearance"> <div id="vector-appearance-pinned-container" class="vector-pinned-container"> <div id="vector-appearance" class="vector-appearance vector-pinnable-element"> <div class="vector-pinnable-header vector-appearance-pinnable-header vector-pinnable-header-pinned" data-feature-name="appearance-pinned" data-pinnable-element-id="vector-appearance" data-pinned-container-id="vector-appearance-pinned-container" data-unpinned-container-id="vector-appearance-unpinned-container" > <div class="vector-pinnable-header-label">Appearance</div> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-pin-button" data-event-name="pinnable-header.vector-appearance.pin">move to sidebar</button> <button class="vector-pinnable-header-toggle-button vector-pinnable-header-unpin-button" data-event-name="pinnable-header.vector-appearance.unpin">hide</button> </div> </div> </div> </nav> </div> </div> <div id="bodyContent" class="vector-body" aria-labelledby="firstHeading" data-mw-ve-target-container> <div class="vector-body-before-content"> <div class="mw-indicators"> </div> <div id="siteSub" class="noprint">From Wikipedia, the free encyclopedia</div> </div> <div id="contentSub"><div id="mw-content-subtitle"></div></div> <div id="mw-content-text" class="mw-body-content"><div class="mw-content-ltr mw-parser-output" lang="en" dir="ltr"><div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">Computer processor which works on arrays of several numbers at once</div> <style 
data-mw-deduplicate="TemplateStyles:r1236090951">.mw-parser-output .hatnote{font-style:italic}.mw-parser-output div.hatnote{padding-left:1.6em;margin-bottom:0.5em}.mw-parser-output .hatnote i{font-style:normal}.mw-parser-output .hatnote+link+.hatnote{margin-top:-0.5em}@media print{body.ns-0 .mw-parser-output .hatnote{display:none!important}}</style><div role="note" class="hatnote navigation-not-searchable">"Array processor" redirects here. Not to be confused with <a href="/wiki/Array_processing" title="Array processing">array processing</a>.</div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">This article is about processors (including <a href="/wiki/GPU" class="mw-redirect" title="GPU">GPUs</a>) that were specifically designed from the ground up to handle large vectors (arrays). For SIMD instructions present in some general-purpose computers, see <a href="/wiki/Flynn%27s_taxonomy#Single_instruction_stream,_multiple_data_streams_(SIMD)" title="Flynn's taxonomy">Flynn's taxonomy § Single instruction stream, multiple data streams (SIMD)</a>.</div> <p>In <a href="/wiki/Computing" title="Computing">computing</a>, a <b>vector processor</b> or <b>array processor</b> is a <a href="/wiki/Central_processing_unit" title="Central processing unit">central processing unit</a> (CPU) that implements an <a href="/wiki/Instruction_set" class="mw-redirect" title="Instruction set">instruction set</a> whose <a href="/wiki/Instruction_(computer_science)" class="mw-redirect" title="Instruction (computer science)">instructions</a> are designed to operate efficiently and effectively on large <a href="/wiki/Array_data_structure" class="mw-redirect" title="Array data structure">one-dimensional arrays</a> of data called <i>vectors</i>. 
This is in contrast to <a href="/wiki/Scalar_processor" title="Scalar processor">scalar processors</a>, whose instructions operate on single data items only, and also to those scalar processors that have additional <a href="/wiki/Single_instruction,_multiple_data" title="Single instruction, multiple data">single instruction, multiple data</a> (SIMD) or <a href="/wiki/SIMD_within_a_register" class="mw-redirect" title="SIMD within a register">SIMD within a register</a> (SWAR) arithmetic units. Vector processors can greatly improve performance on certain workloads, notably <a href="/wiki/Numerical_simulation" class="mw-redirect" title="Numerical simulation">numerical simulation</a> and similar tasks. Vector processing techniques are also used in <a href="/wiki/Video_game_console" title="Video game console">video-game console</a> hardware and in <a href="/wiki/Graphics_accelerator" class="mw-redirect" title="Graphics accelerator">graphics accelerators</a>. </p><p>Vector machines appeared in the early 1970s and dominated <a href="/wiki/Supercomputer" title="Supercomputer">supercomputer</a> design from the 1970s into the 1990s, notably the various <a href="/wiki/Cray" title="Cray">Cray</a> platforms. The rapid fall in the <a href="/wiki/Price-to-performance_ratio" class="mw-redirect" title="Price-to-performance ratio">price-to-performance ratio</a> of conventional <a href="/wiki/Microprocessor" title="Microprocessor">microprocessor</a> designs led to a decline in vector supercomputers during the 1990s. 
</p> <meta property="mw:PageProp/toc" /> <div class="mw-heading mw-heading2"><h2 id="History">History</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=1" title="Edit section: History"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <div class="mw-heading mw-heading3"><h3 id="Early_research_and_development">Early research and development</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=2" title="Edit section: Early research and development"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Vector processing development began in the early 1960s at the <a href="/wiki/Westinghouse_Electric_Corporation" title="Westinghouse Electric Corporation">Westinghouse Electric Corporation</a> in their <i>Solomon</i> project. Solomon's goal was to dramatically increase math performance by using a large number of simple <a href="/wiki/Coprocessor" title="Coprocessor">coprocessors</a> under the control of a single master <a href="/wiki/Central_processing_unit" title="Central processing unit">central processing unit</a> (CPU). The CPU fed a single common instruction to all of the <a href="/wiki/Arithmetic_logic_unit" title="Arithmetic logic unit">arithmetic logic units</a> (ALUs), one per cycle, but with a different data point for each one to work on. This allowed the Solomon machine to apply a single <a href="/wiki/Algorithm" title="Algorithm">algorithm</a> to a large <a href="/wiki/Data_set" title="Data set">data set</a>, fed in the form of an array.<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. 
(July 2023)">citation needed</span></a></i>]</sup> </p><p>In 1962, Westinghouse cancelled the project, but the effort was restarted by the <a href="/wiki/University_of_Illinois_at_Urbana%E2%80%93Champaign" class="mw-redirect" title="University of Illinois at Urbana–Champaign">University of Illinois at Urbana–Champaign</a> as the <a href="/wiki/ILLIAC_IV" title="ILLIAC IV">ILLIAC IV</a>. Their version of the design originally called for a 1 <a href="/wiki/GFLOPS" class="mw-redirect" title="GFLOPS">GFLOPS</a> machine with 256 ALUs, but, when it was finally delivered in 1972, it had only 64 ALUs and could reach only 100 to 150 MFLOPS. Nevertheless, it showed that the basic concept was sound, and, when used on data-intensive applications, such as <a href="/wiki/Computational_fluid_dynamics" title="Computational fluid dynamics">computational fluid dynamics</a>, the ILLIAC was the fastest machine in the world. The ILLIAC approach of using separate ALUs for each data element is not common to later designs, and is often referred to under a separate category, <a href="/wiki/Massively_parallel" title="Massively parallel">massively parallel</a> computing. Around this time Flynn categorized this type of processing as an early form of <a href="/wiki/Single_instruction,_multiple_threads" title="Single instruction, multiple threads">single instruction, multiple threads</a> (SIMT).<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. 
(July 2023)">citation needed</span></a></i>]</sup> </p><p><a href="/wiki/International_Computers_Limited" title="International Computers Limited">International Computers Limited</a> sought to avoid many of the difficulties with the ILLIAC concept with its own <a href="/wiki/Distributed_Array_Processor" class="mw-redirect" title="Distributed Array Processor">Distributed Array Processor</a> (DAP) design, categorising the ILLIAC and DAP as cellular array processors that potentially offered substantial performance benefits over conventional vector processor designs such as the CDC STAR-100 and Cray 1.<sup id="cite_ref-newscientist19760617_dap_1-0" class="reference"><a href="#cite_note-newscientist19760617_dap-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Computer_for_operations_with_functions">Computer for operations with functions</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=3" title="Edit section: Computer for operations with functions"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A <a href="/wiki/Computer_for_operations_with_functions" title="Computer for operations with functions">computer for operations with functions</a> was presented and developed by Kartsev in 1967.<sup id="cite_ref-Malinovsky_2-0" class="reference"><a href="#cite_note-Malinovsky-2"><span class="cite-bracket">[</span>2<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Supercomputers">Supercomputers</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=4" title="Edit section: Supercomputers"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1251242444">.mw-parser-output 
.ambox{border:1px solid #a2a9b1;border-left:10px solid #36c;background-color:#fbfbfb;box-sizing:border-box}.mw-parser-output .ambox+link+.ambox,.mw-parser-output .ambox+link+style+.ambox,.mw-parser-output .ambox+link+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+style+.ambox,.mw-parser-output .ambox+.mw-empty-elt+link+link+.ambox{margin-top:-1px}html body.mediawiki .mw-parser-output .ambox.mbox-small-left{margin:4px 1em 4px 0;overflow:hidden;width:238px;border-collapse:collapse;font-size:88%;line-height:1.25em}.mw-parser-output .ambox-speedy{border-left:10px solid #b32424;background-color:#fee7e6}.mw-parser-output .ambox-delete{border-left:10px solid #b32424}.mw-parser-output .ambox-content{border-left:10px solid #f28500}.mw-parser-output .ambox-style{border-left:10px solid #fc3}.mw-parser-output .ambox-move{border-left:10px solid #9932cc}.mw-parser-output .ambox-protection{border-left:10px solid #a2a9b1}.mw-parser-output .ambox .mbox-text{border:none;padding:0.25em 0.5em;width:100%}.mw-parser-output .ambox .mbox-image{border:none;padding:2px 0 2px 0.5em;text-align:center}.mw-parser-output .ambox .mbox-imageright{border:none;padding:2px 0.5em 2px 0;text-align:center}.mw-parser-output .ambox .mbox-empty-cell{border:none;padding:0;width:1px}.mw-parser-output .ambox .mbox-image-div{width:52px}@media(min-width:720px){.mw-parser-output .ambox{margin:0 10%}}@media print{body.ns-0 .mw-parser-output .ambox{display:none!important}}</style><table class="box-Unreferenced_section plainlinks metadata ambox ambox-content ambox-Unreferenced" role="presentation"><tbody><tr><td class="mbox-image"><div class="mbox-image-div"><span typeof="mw:File"><a href="/wiki/File:Question_book-new.svg" class="mw-file-description"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" decoding="async" width="50" height="39" class="mw-file-element" 
srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" data-file-width="512" data-file-height="399" /></a></span></div></td><td class="mbox-text"><div class="mbox-text-span">This section <b>does not <a href="/wiki/Wikipedia:Citing_sources" title="Wikipedia:Citing sources">cite</a> any <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">sources</a></b>.<span class="hide-when-compact"> Please help <a href="/wiki/Special:EditPage/Vector_processor" title="Special:EditPage/Vector processor">improve this section</a> by <a href="/wiki/Help:Referencing_for_beginners" title="Help:Referencing for beginners">adding citations to reliable sources</a>. Unsourced material may be challenged and <a href="/wiki/Wikipedia:Verifiability#Burden_of_evidence" title="Wikipedia:Verifiability">removed</a>.</span> <span class="date-container"><i>(<span class="date">July 2023</span>)</i></span><span class="hide-when-compact"><i> (<small><a href="/wiki/Help:Maintenance_template_removal" title="Help:Maintenance template removal">Learn how and when to remove this message</a></small>)</i></span></div></td></tr></tbody></table> <p>The first vector supercomputers were the <a href="/wiki/Control_Data_Corporation" title="Control Data Corporation">Control Data Corporation</a> <a href="/wiki/STAR-100" class="mw-redirect" title="STAR-100">STAR-100</a> and <a href="/wiki/Texas_Instruments" title="Texas Instruments">Texas Instruments</a> <a href="/wiki/Advanced_Scientific_Computer" class="mw-redirect" title="Advanced Scientific Computer">Advanced Scientific Computer</a> (ASC), which were introduced in 1974 and 1972, respectively. 
</p><p>The basic ASC (i.e., "one pipe") ALU used a pipeline architecture that supported both scalar and vector computations, with peak performance reaching approximately 20 MFLOPS, readily achieved when processing long vectors. Expanded ALU configurations supported "two pipes" or "four pipes" with a corresponding 2X or 4X performance gain. Memory bandwidth was sufficient to support these expanded modes. </p><p>The STAR-100 was otherwise slower than CDC's own supercomputers like the <a href="/wiki/CDC_7600" title="CDC 7600">CDC 7600</a>, but at data-related tasks they could keep up while being much smaller and less expensive. However the machine also took considerable time decoding the vector instructions and getting ready to run the process, so it required very specific data sets to work on before it actually sped anything up. </p><p>The vector technique was first fully exploited in 1976 by the famous <a href="/wiki/Cray-1" title="Cray-1">Cray-1</a>. Instead of leaving the data in memory like the STAR-100 and ASC, the Cray design had eight <a href="/wiki/Vector_registers" class="mw-redirect" title="Vector registers">vector registers</a>, which held sixty-four 64-bit words each. The vector instructions were applied between registers, which is much faster than talking to main memory. Whereas the STAR-100 would apply a single operation across a long vector in memory and then move on to the next operation, the Cray design would load a smaller section of the vector into registers and then apply as many operations as it could to that data, thereby avoiding many of the much slower memory access operations. </p><p>The Cray design used <a href="/wiki/Pipeline_parallelism" class="mw-redirect" title="Pipeline parallelism">pipeline parallelism</a> to implement vector instructions rather than multiple ALUs. 
In addition, the design had completely separate pipelines for different instructions, for example, addition/subtraction was implemented in different hardware than multiplication. This allowed a batch of vector instructions to be pipelined into each of the ALU subunits, a technique they called <a href="/wiki/Chaining_(vector_processing)" title="Chaining (vector processing)"><i>vector chaining</i></a>. The Cray-1 normally had a performance of about 80 MFLOPS, but with up to three chains running it could peak at 240 MFLOPS and averaged around 150 – far faster than any machine of the era. </p> <figure class="mw-default-size" typeof="mw:File/Thumb"><a href="/wiki/File:Cray_J90_CPU_module.jpg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Cray_J90_CPU_module.jpg/220px-Cray_J90_CPU_module.jpg" decoding="async" width="220" height="165" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Cray_J90_CPU_module.jpg/330px-Cray_J90_CPU_module.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Cray_J90_CPU_module.jpg/440px-Cray_J90_CPU_module.jpg 2x" data-file-width="720" data-file-height="540" /></a><figcaption><a href="/wiki/Cray_J90" title="Cray J90">Cray J90</a> processor module with four scalar/vector processors</figcaption></figure> <p>Other examples followed. <a href="/wiki/Control_Data_Corporation" title="Control Data Corporation">Control Data Corporation</a> tried to re-enter the high-end market again with its <a href="/wiki/ETA-10" class="mw-redirect" title="ETA-10">ETA-10</a> machine, but it sold poorly and they took that as an opportunity to leave the supercomputing field entirely. 
In the early and mid-1980s Japanese companies (<a href="/wiki/Fujitsu" title="Fujitsu">Fujitsu</a>, <a href="/wiki/Hitachi" title="Hitachi">Hitachi</a> and <a href="/wiki/Nippon_Electric_Corporation" class="mw-redirect" title="Nippon Electric Corporation">Nippon Electric Corporation</a> (NEC)) introduced register-based vector machines similar to the Cray-1, typically being slightly faster and much smaller. <a href="/wiki/Oregon" title="Oregon">Oregon</a>-based <a href="/wiki/Floating_Point_Systems" title="Floating Point Systems">Floating Point Systems</a> (FPS) built add-on array processors for <a href="/wiki/Minicomputer" title="Minicomputer">minicomputers</a>, later building their own <a href="/wiki/Minisupercomputer" title="Minisupercomputer">minisupercomputers</a>. </p><p>Throughout, Cray continued to be the performance leader, continually beating the competition with a series of machines that led to the <a href="/wiki/Cray-2" title="Cray-2">Cray-2</a>, <a href="/wiki/Cray_X-MP" title="Cray X-MP">Cray X-MP</a> and <a href="/wiki/Cray_Y-MP" title="Cray Y-MP">Cray Y-MP</a>. Since then, the supercomputer market has focused much more on <a href="/wiki/Massively_parallel" title="Massively parallel">massively parallel</a> processing rather than better implementations of vector processors. However, recognising the benefits of vector processing, IBM developed <a href="/wiki/Virtual_Vector_Architecture" class="mw-redirect" title="Virtual Vector Architecture">Virtual Vector Architecture</a> for use in supercomputers coupling several scalar processors to act as a vector processor. </p><p>Although vector supercomputers resembling the Cray-1 are less popular these days, NEC has continued to make this type of computer up to the present day with their <a href="/wiki/NEC_SX_architecture" class="mw-redirect" title="NEC SX architecture">SX series</a> of computers. 
Most recently, the <a href="/wiki/SX-Aurora_TSUBASA" class="mw-redirect" title="SX-Aurora TSUBASA">SX-Aurora TSUBASA</a> places the processor and either 24 or 48 gigabytes of memory on an <a href="/wiki/High_Bandwidth_Memory" title="High Bandwidth Memory">HBM</a> 2 module within a card that physically resembles a graphics coprocessor, but instead of serving as a co-processor, it is the main computer with the PC-compatible computer into which it is plugged serving support functions. </p> <div class="mw-heading mw-heading3"><h3 id="GPU">GPU</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=5" title="Edit section: GPU"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">Main article: <a href="/wiki/Single_instruction,_multiple_threads" title="Single instruction, multiple threads">Single instruction, multiple threads</a></div> <p>Modern graphics processing units (<a href="/wiki/GPUs" class="mw-redirect" title="GPUs">GPUs</a>) include an array of <a href="/wiki/Shaders" class="mw-redirect" title="Shaders">shader pipelines</a> which may be driven by <a href="/wiki/Compute_kernel" title="Compute kernel">compute kernels</a>, and can be considered vector processors (using a similar strategy for hiding memory latencies). As shown in <a href="/wiki/Flynn%27s_taxonomy" title="Flynn's taxonomy">Flynn's 1972 paper</a>, the key distinguishing factor of SIMT-based GPUs is that they have a single instruction decoder-broadcaster but that the cores receiving and executing that same instruction are otherwise reasonably normal: their own ALUs, their own register files, their own Load/Store units and their own independent L1 data caches. 
Thus although all cores simultaneously execute the exact same instruction in lock-step with each other they do so with completely different data from completely different memory locations. This is <i>significantly</i> more complex and involved than <a href="/wiki/Flynn%27s_Taxonomy#Pipelined_processor" class="mw-redirect" title="Flynn's Taxonomy">"Packed SIMD"</a>, which is strictly limited to execution of parallel pipelined arithmetic operations only. Although the exact internal details of today's commercial GPUs are proprietary secrets, the MIAOW<sup id="cite_ref-3" class="reference"><a href="#cite_note-3"><span class="cite-bracket">[</span>3<span class="cite-bracket">]</span></a></sup> team was able to piece together anecdotal information sufficient to implement a subset of the AMDGPU architecture.<sup id="cite_ref-4" class="reference"><a href="#cite_note-4"><span class="cite-bracket">[</span>4<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Recent_development">Recent development</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=6" title="Edit section: Recent development"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Several modern CPU architectures are being designed as vector processors. 
The <a href="/wiki/RISC-V#Vector_set" title="RISC-V">RISC-V vector extension</a> follows similar principles as the early vector processors, and is being implemented in commercial products such as the <a href="/wiki/Andes_Technology" title="Andes Technology">Andes Technology</a> AX45MPV.<sup id="cite_ref-5" class="reference"><a href="#cite_note-5"><span class="cite-bracket">[</span>5<span class="cite-bracket">]</span></a></sup> There are also several <a href="/wiki/Open_source" title="Open source">open source</a> vector processor architectures being developed, including <a href="/wiki/Agner_Fog#ForwardCom_instruction_set" title="Agner Fog">ForwardCom</a> and <a href="/wiki/Libre-SOC" title="Libre-SOC">Libre-SOC</a>. </p> <div class="mw-heading mw-heading2"><h2 id="Comparison_with_modern_architectures">Comparison with modern architectures</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=7" title="Edit section: Comparison with modern architectures"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>As of 2016<sup class="plainlinks noexcerpt noprint asof-tag update" style="display:none;"><a class="external text" href="https://en.wikipedia.org/w/index.php?title=Vector_processor&action=edit">[update]</a></sup> most commodity CPUs implement architectures that feature fixed-length SIMD instructions. On first inspection these can be considered a form of vector processing because they operate on multiple (vectorized, explicit length) data sets, and borrow features from vector processors. However, by definition, the addition of SIMD cannot, by itself, qualify a processor as an actual <i>vector processor</i>, because SIMD is <em>fixed-length</em>, and vectors are <em>variable-length</em>. 
The difference is illustrated below with examples, showing and comparing the three categories: Pure SIMD, Predicated SIMD, and Pure Vector Processing.<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="This claim needs references to reliable sources. (June 2021)">citation needed</span></a></i>]</sup> </p> <ul><li><b>Pure (fixed) SIMD</b> - also known as "Packed SIMD",<sup id="cite_ref-6" class="reference"><a href="#cite_note-6"><span class="cite-bracket">[</span>6<span class="cite-bracket">]</span></a></sup> <a href="/wiki/SIMD_within_a_register" class="mw-redirect" title="SIMD within a register">SIMD within a register</a> (SWAR), and <a href="/wiki/Flynn%27s_taxonomy#Pipelined_processor" title="Flynn's taxonomy">Pipelined Processor</a> in Flynn's Taxonomy. Common examples using SIMD with features inspired by vector processors include: Intel x86's <a href="/wiki/MMX_(instruction_set)" title="MMX (instruction set)">MMX</a>, <a href="/wiki/Streaming_SIMD_Extensions" title="Streaming SIMD Extensions">SSE</a> and <a href="/wiki/Advanced_Vector_Extensions" title="Advanced Vector Extensions">AVX</a> instructions, AMD's <a href="/wiki/3DNow!" title="3DNow!">3DNow!</a> extensions, <a href="/wiki/ARM_NEON" class="mw-redirect" title="ARM NEON">ARM NEON</a>, Sparc's <a href="/wiki/Visual_Instruction_Set" title="Visual Instruction Set">VIS</a> extension, <a href="/wiki/PowerPC" title="PowerPC">PowerPC</a>'s <a href="/wiki/AltiVec" title="AltiVec">AltiVec</a> and MIPS' <a href="/wiki/MIPS_architecture#Application-specific_extensions" title="MIPS architecture">MSA</a>. 
In 2000, <a href="/wiki/IBM" title="IBM">IBM</a>, <a href="/wiki/Toshiba" title="Toshiba">Toshiba</a> and <a href="/wiki/Sony" title="Sony">Sony</a> collaborated to create the <a href="/wiki/Cell_processor" class="mw-redirect" title="Cell processor">Cell processor</a>, which is also SIMD.</li> <li><b>Predicated SIMD</b> - also known as <a href="/wiki/Flynn%27s_taxonomy#Associative_processor" title="Flynn's taxonomy">associative processing</a>. Two notable examples which have per-element (lane-based) predication are <a href="/wiki/Scalable_Vector_Extension" class="mw-redirect" title="Scalable Vector Extension">ARM SVE2</a> and <a href="/wiki/AVX-512" title="AVX-512">AVX-512</a></li> <li><b>Pure Vectors</b> - as categorised in <a href="/wiki/Duncan%27s_taxonomy#Pipelined_vector_processors" title="Duncan's taxonomy">Duncan's taxonomy</a> - these include the original <a href="/wiki/Cray-1" title="Cray-1">Cray-1</a>, <a href="/wiki/Convex_Computer" title="Convex Computer">Convex C-Series</a>, <a href="/wiki/NEC_SX" title="NEC SX">NEC SX</a>, and <a href="/wiki/RISC-V#Vector_set" title="RISC-V">RISC-V RVV</a>. Although memory-based, the <a href="/wiki/CDC_STAR-100" title="CDC STAR-100">CDC STAR-100</a> was also a vector processor.</li></ul> <p>Other CPU designs include some multiple instructions for vector processing on multiple (vectorized) data sets, typically known as <a href="/wiki/MIMD" class="mw-redirect" title="MIMD">MIMD</a> (Multiple Instruction, Multiple Data) and realized with <a href="/wiki/VLIW" class="mw-redirect" title="VLIW">VLIW</a> (Very Long Instruction Word) and <a href="/wiki/Explicitly_parallel_instruction_computing" title="Explicitly parallel instruction computing">EPIC</a> (Explicitly Parallel Instruction Computing). The <a href="/wiki/Fujitsu_FR-V" class="mw-redirect" title="Fujitsu FR-V">Fujitsu FR-V</a> VLIW/vector processor combines both technologies. 
</p> <div class="mw-heading mw-heading3"><h3 id="Difference_between_SIMD_and_vector_processors">Difference between SIMD and vector processors</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=8" title="Edit section: Difference between SIMD and vector processors"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>SIMD instruction sets lack crucial features when compared to vector instruction sets. The most important of these is that vector processors, inherently by definition and design, have always been variable-length since their inception. </p><p>Whereas pure (fixed-width, no predication) SIMD is often mistakenly claimed to be "vector" (because SIMD processes data which happens to be vectors), through close analysis and comparison of historic and modern ISAs, actual vector ISAs may be observed to have the following features that no SIMD ISA has:<sup class="noprint Inline-Template Template-Fact" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Citation_needed" title="Wikipedia:Citation needed"><span title="See Talk:Vector processor#Discernable features (June 2021)">citation needed</span></a></i>]</sup> </p> <ul><li>a way to set the vector length, such as the <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">vsetvl</code> instruction in RISCV RVV,<sup id="cite_ref-7" class="reference"><a href="#cite_note-7"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup> or the <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">lvl</code> instruction in NEC SX,<sup id="cite_ref-8" class="reference"><a href="#cite_note-8"><span class="cite-bracket">[</span>8<span class="cite-bracket">]</span></a></sup> without restricting the length to a <a href="/wiki/Power_of_two" title="Power of two">power of two</a> or to a multiple of a fixed data width.</li> 
<li>Iteration and reduction over elements <em>within</em> vectors.</li></ul> <p>Predicated SIMD (part of <a href="/wiki/Flynn%27s_taxonomy" title="Flynn's taxonomy">Flynn's taxonomy</a>), which provides comprehensive individual element-level predicate masks on every vector instruction, as is now available in ARM SVE2<sup id="cite_ref-9" class="reference"><a href="#cite_note-9"><span class="cite-bracket">[</span>9<span class="cite-bracket">]</span></a></sup> and <a href="/wiki/AVX-512" title="AVX-512">AVX-512</a>, almost qualifies as a vector processor.<sup class="noprint Inline-Template" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Please_clarify" title="Wikipedia:Please clarify"><span title="Please clarify the preceding statement or statements with a good explanation from a reliable source. (December 2023)">how?</span></a></i>]</sup> Predicated SIMD uses fixed-width SIMD ALUs but allows locally controlled (predicated) activation of units to provide the appearance of variable length vectors. Examples below help explain these categorical distinctions. </p><p>SIMD, because it uses fixed-width batch processing, is <em>unable by design</em> to cope with iteration and reduction. This is illustrated further with examples, below. 
</p> <figure typeof="mw:File/Thumb"><a href="/wiki/File:Simd_vs_vector.png" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/4/4b/Simd_vs_vector.png/500px-Simd_vs_vector.png" decoding="async" width="500" height="216" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/4b/Simd_vs_vector.png/750px-Simd_vs_vector.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/4b/Simd_vs_vector.png/1000px-Simd_vs_vector.png 2x" data-file-width="1024" data-file-height="443" /></a><figcaption></figcaption></figure> <p>Additionally, vector processors can be more resource-efficient by using slower hardware and saving power, but still achieving throughput and having less latency than SIMD, through <a href="/wiki/Chaining_(vector_processing)" title="Chaining (vector processing)">vector chaining</a>.<sup id="cite_ref-10" class="reference"><a href="#cite_note-10"><span class="cite-bracket">[</span>10<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-11" class="reference"><a href="#cite_note-11"><span class="cite-bracket">[</span>11<span class="cite-bracket">]</span></a></sup> </p><p>Consider both a SIMD processor and a vector processor working on 4 64-bit elements, doing a LOAD, ADD, MULTIPLY and STORE sequence. If the SIMD width is 4, then the SIMD processor must LOAD four elements entirely before it can move on to the ADDs, must complete all the ADDs before it can move on to the MULTIPLYs, and likewise must complete all of the MULTIPLYs before it can start the STOREs. This is by definition and by design.<sup id="cite_ref-12" class="reference"><a href="#cite_note-12"><span class="cite-bracket">[</span>12<span class="cite-bracket">]</span></a></sup> </p><p>Having to perform 4-wide simultaneous 64-bit LOADs and 64-bit STOREs is very costly in hardware (256 bit data paths to memory). Having 4x 64-bit ALUs, especially MULTIPLY, likewise. 
To avoid these high costs, a SIMD processor would have to have 1-wide 64-bit LOAD, 1-wide 64-bit STORE, and only 2-wide 64-bit ALUs. As shown in the diagram, which assumes a <a href="/wiki/Superscalar_processor" title="Superscalar processor">multi-issue execution model</a>, the consequences are that the operations now take longer to complete. If multi-issue is not possible, then the operations take even longer because the LD may not be issued (started) at the same time as the first ADDs, and so on. If there are only 4-wide 64-bit SIMD ALUs, the completion time is even worse: only when all four LOADs have completed may the SIMD operations start, and only when all ALU operations have completed may the STOREs begin. </p><p>A vector processor, by contrast, even if it is <i>single-issue</i> and uses no SIMD ALUs, only having 1-wide 64-bit LOAD, 1-wide 64-bit STORE (and, as in the <a href="/wiki/Cray-1" title="Cray-1">Cray-1</a>, the ability to run MULTIPLY simultaneously with ADD), may complete the four operations faster than a SIMD processor with 1-wide LOAD, 1-wide STORE, and 2-wide SIMD. This more efficient resource utilization, due to <a href="/wiki/Chaining_(vector_processing)" title="Chaining (vector processing)">vector chaining</a>, is a key advantage and difference compared to SIMD. SIMD, by design and definition, cannot perform chaining except to the entire group of results.<sup id="cite_ref-13" class="reference"><a href="#cite_note-13"><span class="cite-bracket">[</span>13<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Description">Description</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=9" title="Edit section: Description"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>In general terms, CPUs are able to manipulate one or two pieces of data at a time. 
For instance, most CPUs have an instruction that essentially says "add A to B and put the result in C". The data for A, B and C could be—in theory at least—encoded directly into the instruction. However, in efficient implementation things are rarely that simple. The data is rarely sent in raw form, and is instead "pointed to" by passing in an address to a memory location that holds the data. Decoding this address and getting the data out of the memory takes some time, during which the CPU traditionally would sit idle waiting for the requested data to show up. As CPU speeds have increased, this <a href="/wiki/Memory_latency" title="Memory latency">memory latency</a> has historically become a large impediment to performance; see <a href="/wiki/Random-access_memory#Memory_wall" title="Random-access memory">Random-access memory § Memory wall</a>. </p><p>In order to reduce the amount of time consumed by these steps, most modern CPUs use a technique known as <a href="/wiki/Instruction_pipelining" title="Instruction pipelining">instruction pipelining</a> in which the instructions pass through several sub-units in turn. The first sub-unit reads the address and decodes it, the next "fetches" the values at those addresses, and the next does the math itself. With pipelining the "trick" is to start decoding the next instruction even before the first has left the CPU, in the fashion of an <a href="/wiki/Assembly_line" title="Assembly line">assembly line</a>, so the <a href="/wiki/Address_decoder" title="Address decoder">address decoder</a> is constantly in use. Any particular instruction takes the same amount of time to complete, a time known as the <i><a href="/wiki/Latency_(engineering)" title="Latency (engineering)">latency</a></i>, but the CPU can process an entire batch of operations, in an overlapping fashion, much faster and more efficiently than if it did so one at a time. </p><p>Vector processors take this concept one step further. 
Instead of pipelining just the instructions, they also pipeline the data itself. The processor is fed instructions that say not just to add A to B, but to add all of the numbers "from here to here" to all of the numbers "from there to there". Instead of constantly having to decode instructions and then fetch the data needed to complete them, the processor reads a single instruction from memory, and it is simply implied in the definition of the instruction <i>itself</i> that the instruction will operate again on another item of data, at an address one increment larger than the last. This allows for significant savings in decoding time. </p><p>To illustrate what a difference this can make, consider the simple task of adding two groups of 10 numbers together. In a normal programming language one would write a "loop" that picked up each of the pairs of numbers in turn, and then added them. To the CPU, this would look something like this: </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="c1">; Hypothetical RISC machine</span> <span class="c1">; assume a, b, and c are memory locations in their respective registers</span> <span class="c1">; add 10 numbers in a to 10 numbers in b, store results in c</span> <span class="w"> </span><span class="nf">move</span><span class="w"> </span><span class="no">$10</span><span class="p">,</span><span class="w"> </span><span class="no">count</span><span class="w"> </span><span class="c1">; count := 10</span> <span class="nl">loop:</span> <span class="w"> </span><span class="nf">load</span><span class="w"> </span><span class="no">r1</span><span class="p">,</span><span class="w"> </span><span class="no">a</span> <span class="w"> </span><span class="nf">load</span><span class="w"> </span><span class="no">r2</span><span class="p">,</span><span class="w"> </span><span class="no">b</span> <span class="w"> </span><span class="nf">add</span><span class="w"> </span><span 
class="no">r3</span><span class="p">,</span><span class="w"> </span><span class="no">r1</span><span class="p">,</span><span class="w"> </span><span class="no">r2</span><span class="w"> </span><span class="c1">; r3 := r1 + r2</span> <span class="w"> </span><span class="nf">store</span><span class="w"> </span><span class="no">r3</span><span class="p">,</span><span class="w"> </span><span class="no">c</span> <span class="w"> </span><span class="nf">add</span><span class="w"> </span><span class="no">a</span><span class="p">,</span><span class="w"> </span><span class="no">a</span><span class="p">,</span><span class="w"> </span><span class="no">$4</span><span class="w"> </span><span class="c1">; move on</span> <span class="w"> </span><span class="nf">add</span><span class="w"> </span><span class="no">b</span><span class="p">,</span><span class="w"> </span><span class="no">b</span><span class="p">,</span><span class="w"> </span><span class="no">$4</span> <span class="w"> </span><span class="nf">add</span><span class="w"> </span><span class="no">c</span><span class="p">,</span><span class="w"> </span><span class="no">c</span><span class="p">,</span><span class="w"> </span><span class="no">$4</span> <span class="w"> </span><span class="nf">dec</span><span class="w"> </span><span class="no">count</span><span class="w"> </span><span class="c1">; decrement</span> <span class="w"> </span><span class="nf">jnez</span><span class="w"> </span><span class="no">count</span><span class="p">,</span><span class="w"> </span><span class="no">loop</span><span class="w"> </span><span class="c1">; loop back if count is not yet 0</span> <span class="w"> </span><span class="nf">ret</span> </pre></div> <p>But to a vector processor, this task looks considerably different: </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="c1">; assume we have vector registers v1-v3</span> <span class="c1">; with size equal or larger than 10</span> <span 
class="w"> </span><span class="nf">move</span><span class="w"> </span><span class="no">$10</span><span class="p">,</span><span class="w"> </span><span class="no">count</span><span class="w"> </span><span class="c1">; count = 10</span> <span class="w"> </span><span class="nf">vload</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">a</span><span class="p">,</span><span class="w"> </span><span class="no">count</span> <span class="w"> </span><span class="nf">vload</span><span class="w"> </span><span class="no">v2</span><span class="p">,</span><span class="w"> </span><span class="no">b</span><span class="p">,</span><span class="w"> </span><span class="no">count</span> <span class="w"> </span><span class="nf">vadd</span><span class="w"> </span><span class="no">v3</span><span class="p">,</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">v2</span> <span class="w"> </span><span class="nf">vstore</span><span class="w"> </span><span class="no">v3</span><span class="p">,</span><span class="w"> </span><span class="no">c</span><span class="p">,</span><span class="w"> </span><span class="no">count</span> <span class="w"> </span><span class="nf">ret</span> </pre></div> <p>Note the complete lack of looping in the instructions, because it is the <i>hardware</i> which has performed 10 sequential operations: effectively the loop count is on an explicit <i>per-instruction</i> basis. 
</p><p>Cray-style vector ISAs take this a step further and provide a global "count" register, called vector length (VL): </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="c1">; again assume we have vector registers v1-v3</span> <span class="c1">; with size larger than or equal to 10</span> <span class="w"> </span><span class="nf">setvli</span><span class="w"> </span><span class="no">$10</span><span class="w"> </span><span class="c1"># Set vector length VL=10</span> <span class="w"> </span><span class="nf">vload</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">a</span><span class="w"> </span><span class="c1"># 10 loads from a</span> <span class="w"> </span><span class="nf">vload</span><span class="w"> </span><span class="no">v2</span><span class="p">,</span><span class="w"> </span><span class="no">b</span><span class="w"> </span><span class="c1"># 10 loads from b</span> <span class="w"> </span><span class="nf">vadd</span><span class="w"> </span><span class="no">v3</span><span class="p">,</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">v2</span><span class="w"> </span><span class="c1"># 10 adds</span> <span class="w"> </span><span class="nf">vstore</span><span class="w"> </span><span class="no">v3</span><span class="p">,</span><span class="w"> </span><span class="no">c</span><span class="w"> </span><span class="c1"># 10 stores into c</span> <span class="w"> </span><span class="nf">ret</span> </pre></div> <p>There are several savings inherent in this approach.<sup id="cite_ref-14" class="reference"><a href="#cite_note-14"><span class="cite-bracket">[</span>14<span class="cite-bracket">]</span></a></sup> </p> <ol><li>only three address translations are needed. 
Depending on the architecture, this can represent a significant saving by itself.</li> <li>Another saving is fetching and decoding the instruction itself, which has to be done only one time instead of ten.</li> <li>The code itself is also smaller, which can lead to more efficient memory use, a reduction in L1 instruction cache size, and a reduction in power consumption.</li> <li>With the program size being reduced, branch prediction has an easier job.</li> <li>With the length (equivalent to SIMD width) not being hard-coded into the instruction, not only is the encoding more compact, it's also "future-proof" and allows even <a href="/wiki/Embedded_processor" class="mw-redirect" title="Embedded processor">embedded processor</a> designs to consider using vectors purely to gain all the other advantages, rather than go for high performance.</li></ol> <p>Additionally, in more modern vector processor ISAs, "Fail on First" or "Fault First" has been introduced (see below) which brings even more advantages. </p><p>But more than that, a high performance vector processor may have multiple <a href="/wiki/Functional_unit" class="mw-redirect" title="Functional unit">functional units</a> adding those numbers in parallel. The checking of dependencies between those numbers is not required as a vector instruction specifies multiple independent operations. This simplifies the control logic required, and can further improve performance by avoiding stalls. The math operations are thus completed far faster overall, the limiting factor being the time required to fetch the data from memory. </p><p>Not all problems can be attacked with this sort of solution. Including these types of instructions necessarily adds complexity to the core CPU. That complexity typically makes <i>other</i> instructions run slower—i.e., whenever it is <b>not</b> adding up many numbers in a row. 
The more complex instructions also add to the complexity of the decoders, which might slow down the decoding of the more common instructions such as normal adding. (<i>This can be somewhat mitigated by keeping the entire ISA to <a href="/wiki/RISC" class="mw-redirect" title="RISC">RISC</a> principles: RVV only adds around 190 vector instructions even with the advanced features.<sup id="cite_ref-15" class="reference"><a href="#cite_note-15"><span class="cite-bracket">[</span>15<span class="cite-bracket">]</span></a></sup></i>) </p><p>Vector processors were traditionally designed to work best only when there are large amounts of data to be worked on. For this reason, these sorts of CPUs were found primarily in <a href="/wiki/Supercomputer" title="Supercomputer">supercomputers</a>, as the supercomputers themselves were, in general, found in places such as weather prediction centers and physics labs, where huge amounts of data are "crunched". However, as shown above and demonstrated by RISC-V RVV the <i>efficiency</i> of vector ISAs brings other benefits which are compelling even for Embedded use-cases. </p> <div class="mw-heading mw-heading3"><h3 id="Vector_instructions">Vector instructions</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=10" title="Edit section: Vector instructions"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236090951"><div role="note" class="hatnote navigation-not-searchable">See also: <a href="/wiki/SIMD" class="mw-redirect" title="SIMD">SIMD</a> and <a href="/wiki/Single_Instruction_Multiple_Threads" class="mw-redirect" title="Single Instruction Multiple Threads">Single Instruction Multiple Threads</a></div> <p>The vector pseudocode example above comes with a big assumption that the vector computer can process more than ten numbers in one batch. 
For a greater quantity of numbers in the vector register, it becomes unfeasible for the computer to have a register that large. As a result, the vector processor either gains the ability to perform loops itself, or exposes some sort of vector control (status) register to the programmer, usually known as the vector length. </p><p>The self-repeating instructions are found in early vector computers like the STAR-100, where the above action would be described in a single instruction (somewhat like <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">vadd c, a, b, $10</code>). They are also found in the <a href="/wiki/X86" title="X86">x86</a> architecture as the <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">REP</code> prefix. However, only very simple calculations can be done effectively in hardware this way without a very large cost increase. Since all operands had to be in memory for the STAR-100 architecture, the latency caused by access became huge too. </p><p>Broadcom included space in all vector operations of the <a href="/wiki/Videocore" class="mw-redirect" title="Videocore">Videocore</a> IV ISA for a <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">REP</code> field, but unlike the STAR-100 which uses memory for its repeats, the Videocore IV repeats are on all operations including arithmetic vector operations. The repeat length can be one of a small range of <a href="/wiki/Power_of_two" title="Power of two">powers of two</a> or sourced from one of the scalar registers.<sup id="cite_ref-16" class="reference"><a href="#cite_note-16"><span class="cite-bracket">[</span>16<span class="cite-bracket">]</span></a></sup> </p><p>The <a href="/wiki/Cray-1" title="Cray-1">Cray-1</a> introduced the idea of using <a href="/wiki/Processor_register" title="Processor register">processor registers</a> to hold vector data in batches. 
The batch lengths (vector length, VL) could be dynamically set with a special instruction, the significance compared to Videocore IV (and, crucially as will be shown below, SIMD as well) being that the repeat length does not have to be part of the instruction encoding. This way, significantly more work can be done in each batch; the instruction encoding is much more elegant and compact as well. The only drawback is that in order to take full advantage of this extra batch processing capacity, the memory load and store speed correspondingly had to increase as well. This is sometimes claimed<sup class="noprint Inline-Template" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Manual_of_Style/Words_to_watch#Unsupported_attributions" title="Wikipedia:Manual of Style/Words to watch"><span title="The material near this tag may use weasel words or too-vague attribution. (November 2021)">by whom?</span></a></i>]</sup> to be a disadvantage of Cray-style vector processors: in reality it is part of achieving high performance throughput, as seen in <a href="/wiki/GPU" class="mw-redirect" title="GPU">GPUs</a>, which face exactly the same issue. </p><p>Modern SIMD computers claim to improve on early Cray by directly using multiple ALUs, for a higher degree of parallelism compared to only using the normal scalar pipeline. Modern vector processors (such as the <a href="/wiki/SX-Aurora_TSUBASA" class="mw-redirect" title="SX-Aurora TSUBASA">SX-Aurora TSUBASA</a>) combine both, by issuing multiple data to multiple internal pipelined SIMD ALUs, the number issued being dynamically chosen by the vector program at runtime. Masks can be used to selectively load and store data in memory locations, and those same masks can be used to selectively disable processing elements of the SIMD ALUs. 
Some processors with SIMD (<a href="/wiki/AVX-512" title="AVX-512">AVX-512</a>, ARM <a href="/wiki/Scalable_Vector_Extension" class="mw-redirect" title="Scalable Vector Extension">SVE2</a>) are capable of this kind of selective, per-element (<a href="/wiki/Predication_(computer_architecture)" title="Predication (computer architecture)">"predicated"</a>) processing, and it is these which somewhat deserve the nomenclature "vector processor" or at least deserve the claim of being capable of "vector processing". SIMD processors without per-element predication (<a href="/wiki/MMX_(instruction_set)" title="MMX (instruction set)">MMX</a>, <a href="/wiki/Streaming_SIMD_Extensions" title="Streaming SIMD Extensions">SSE</a>, <a href="/wiki/AltiVec" title="AltiVec">AltiVec</a>) categorically do not. </p><p>Modern GPUs, which have many small compute units each with their own independent SIMD ALUs, use <a href="/wiki/Single_Instruction_Multiple_Threads" class="mw-redirect" title="Single Instruction Multiple Threads">Single Instruction Multiple Threads</a> (SIMT). SIMT units run from a shared single broadcast synchronised Instruction Unit. The "vector registers" are very wide and the pipelines tend to be long. The "threading" part of SIMT involves the way data is handled independently on each of the compute units. </p><p>In addition, GPUs such as the Broadcom <a href="/wiki/Videocore" class="mw-redirect" title="Videocore">Videocore</a> IV and other external vector processors like the <a href="/wiki/NEC_SX-Aurora_TSUBASA" title="NEC SX-Aurora TSUBASA">NEC SX-Aurora TSUBASA</a> may use fewer vector units than the width implies: instead of having 64 units for a 64-number-wide register, the hardware might instead do a pipelined loop over 16 units for a hybrid approach. 
The Broadcom <a href="/wiki/Videocore" class="mw-redirect" title="Videocore">Videocore</a> IV is also capable of this hybrid approach: nominally stating that its SIMD QPU Engine supports 16-long FP array operations in its instructions, it actually does them 4 at a time, as (another) form of "threads".<sup id="cite_ref-17" class="reference"><a href="#cite_note-17"><span class="cite-bracket">[</span>17<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading3"><h3 id="Vector_instruction_example">Vector instruction example</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=11" title="Edit section: Vector instruction example"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>This example starts with an algorithm ("IAXPY"), first showing it in scalar instructions, then SIMD, then predicated SIMD, and finally vector instructions. This incrementally helps illustrate the difference between a traditional vector processor and a modern SIMD one. 
The example starts with a 32-bit integer variant of the "DAXPY" function, in <a href="/wiki/C_(programming_language)" title="C (programming language)">C</a>: </p> <div class="mw-highlight mw-highlight-lang-c mw-content-ltr" dir="ltr"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="nf">iaxpy</span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">n</span><span class="p">,</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">a</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">x</span><span class="p">[],</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">y</span><span class="p">[])</span><span class="w"> </span><span class="p">{</span> <span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o"><</span><span class="w"> </span><span class="n">n</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="o">++</span><span class="p">)</span> <span class="w"> </span><span class="n">y</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">a</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">x</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">y</span><span 
class="p">[</span><span class="n">i</span><span class="p">];</span> <span class="p">}</span> </pre></div> <p>In each iteration, every element of y has an element of x multiplied by a and added to it. The program is expressed in scalar linear form for readability. </p> <div class="mw-heading mw-heading4"><h4 id="Scalar_assembler">Scalar assembler</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=12" title="Edit section: Scalar assembler"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The scalar version of this would load one of each of x and y, process one calculation, store one result, and loop: </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="nl">loop:</span> <span class="w"> </span><span class="nf">load32</span><span class="w"> </span><span class="no">r1</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="w"> </span><span class="c1">; load one 32bit data</span> <span class="w"> </span><span class="nf">load32</span><span class="w"> </span><span class="no">r2</span><span class="p">,</span><span class="w"> </span><span class="no">y</span> <span class="w"> </span><span class="nf">mul32</span><span class="w"> </span><span class="no">r1</span><span class="p">,</span><span class="w"> </span><span class="no">a</span><span class="p">,</span><span class="w"> </span><span class="no">r1</span><span class="w"> </span><span class="c1">; r1 := r1 * a</span> <span class="w"> </span><span class="nf">add32</span><span class="w"> </span><span class="no">r3</span><span class="p">,</span><span class="w"> </span><span class="no">r1</span><span class="p">,</span><span class="w"> </span><span class="no">r2</span><span class="w"> </span><span class="c1">; r3 := r1 + r2</span> <span class="w"> </span><span class="nf">store32</span><span class="w"> </span><span 
class="no">r3</span><span class="p">,</span><span class="w"> </span><span class="no">y</span> <span class="w"> </span><span class="nf">addl</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">$4</span><span class="w"> </span><span class="c1">; x := x + 4</span> <span class="w"> </span><span class="nf">addl</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">$4</span> <span class="w"> </span><span class="nf">subl</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">$1</span><span class="w"> </span><span class="c1">; n := n - 1</span> <span class="w"> </span><span class="nf">jgz</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">loop</span><span class="w"> </span><span class="c1">; loop back if n > 0</span> <span class="nl">out:</span> <span class="w"> </span><span class="nf">ret</span> </pre></div> <p>The STAR-like code remains concise, but because the STAR-100's vectorisation was by design based around memory accesses, an extra slot of memory is now required to process the information. Two times the latency is also needed due to the extra requirement of memory access. 
</p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="w"> </span><span class="c1">; Assume tmp is pre-allocated</span> <span class="w"> </span><span class="nf">vmul</span><span class="w"> </span><span class="no">tmp</span><span class="p">,</span><span class="w"> </span><span class="no">a</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">n</span><span class="w"> </span><span class="c1">; tmp[i] = a * x[i]</span> <span class="w"> </span><span class="nf">vadd</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">tmp</span><span class="p">,</span><span class="w"> </span><span class="no">n</span><span class="w"> </span><span class="c1">; y[i] = y[i] + tmp[i]</span> <span class="w"> </span><span class="nf">ret</span> </pre></div> <div class="mw-heading mw-heading4"><h4 id="Pure_(non-predicated,_packed)_SIMD"><span id="Pure_.28non-predicated.2C_packed.29_SIMD"></span>Pure (non-predicated, packed) SIMD</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=13" title="Edit section: Pure (non-predicated, packed) SIMD"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>A modern packed SIMD architecture, known by many names (listed in <a href="/wiki/Flynn%27s_taxonomy#Pipelined_processor" title="Flynn's taxonomy">Flynn's taxonomy</a>), can do most of the operation in batches. The code is mostly similar to the scalar version. 
It is assumed that both x and y are <a href="/wiki/Data_structure_alignment" title="Data structure alignment">properly aligned</a> here (only start on a multiple of 16) and that n is a multiple of 4, as otherwise some setup code would be needed to calculate a mask or to run a scalar version. It can also be assumed, for simplicity, that the SIMD instructions have an option to automatically repeat scalar operands, like ARM NEON can.<sup id="cite_ref-18" class="reference"><a href="#cite_note-18"><span class="cite-bracket">[</span>18<span class="cite-bracket">]</span></a></sup> If it does not, a "splat" (broadcast) must be used, to copy the scalar argument across a SIMD register: </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="w"> </span><span class="nf">splatx4</span><span class="w"> </span><span class="no">v4</span><span class="p">,</span><span class="w"> </span><span class="no">a</span><span class="w"> </span><span class="c1">; v4 = a,a,a,a</span> </pre></div> <p>The time taken would be basically the same as a vector implementation of <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">y = mx + c</code> described above. 
</p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="nl">vloop:</span> <span class="w"> </span><span class="nf">load32x4</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">x</span> <span class="w"> </span><span class="nf">load32x4</span><span class="w"> </span><span class="no">v2</span><span class="p">,</span><span class="w"> </span><span class="no">y</span> <span class="w"> </span><span class="nf">mul32x4</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">a</span><span class="p">,</span><span class="w"> </span><span class="no">v1</span><span class="w"> </span><span class="c1">; v1 := v1 * a</span> <span class="w"> </span><span class="nf">add32x4</span><span class="w"> </span><span class="no">v3</span><span class="p">,</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">v2</span><span class="w"> </span><span class="c1">; v3 := v1 + v2</span> <span class="w"> </span><span class="nf">store32x4</span><span class="w"> </span><span class="no">v3</span><span class="p">,</span><span class="w"> </span><span class="no">y</span> <span class="w"> </span><span class="nf">addl</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">$16</span><span class="w"> </span><span class="c1">; x := x + 16</span> <span class="w"> </span><span class="nf">addl</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">$16</span> <span class="w"> </span><span class="nf">subl</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span 
class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">$4</span><span class="w"> </span><span class="c1">; n := n - 4</span> <span class="w"> </span><span class="nf">jgz</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">vloop</span><span class="w"> </span><span class="c1">; go back if n > 0</span> <span class="nl">out:</span> <span class="w"> </span><span class="nf">ret</span> </pre></div> <p>Note that both x and y pointers are incremented by 16, because that is how long (in bytes) four 32-bit integers are. The decision was made that the algorithm <i>shall</i> only cope with 4-wide SIMD, therefore the constant is hard-coded into the program. </p><p>Unfortunately for SIMD, the clue was in the assumption above, "that n is a multiple of 4" as well as "aligned access", which, clearly, is a limited specialist use-case. </p><p>Realistically, for general-purpose loops such as in portable libraries, where n cannot be limited in this way, the overhead of setup and cleanup for SIMD in order to cope with non-multiples of the SIMD width, can far exceed the instruction count inside the loop itself. Assuming worst-case that the hardware cannot do misaligned SIMD memory accesses, a real-world algorithm will: </p> <ul><li>first have to have a preparatory section which works on the beginning unaligned data, up to the first point where SIMD memory-aligned operations can take over. this will either involve (slower) scalar-only operations or smaller-sized packed SIMD operations. 
Each copy implements the full algorithm inner loop.</li> <li>perform the aligned SIMD loop at the maximum SIMD width up until the last few elements (those remaining that do not fit the fixed SIMD width)</li> <li>have a cleanup phase which, like the preparatory section, is just as large and just as complex.</li></ul> <p>Eight-wide SIMD requires repeating the inner loop algorithm first with four-wide SIMD elements, then two-wide SIMD, then one (scalar), with a test and branch in between each one, in order to cover the first and last remaining SIMD elements (0 <= n <= 7). </p><p>This more than <i>triples</i> the size of the code, in fact in extreme cases it results in an <i>order of magnitude</i> increase in instruction count! This can easily be demonstrated by compiling the iaxpy example for <a href="/wiki/AVX-512" title="AVX-512">AVX-512</a>, using the options <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">"-O3 -march=knl"</code> to <a href="/wiki/GNU_Compiler_Collection" title="GNU Compiler Collection">gcc</a>. </p><p>Over time as the ISA evolves to keep increasing performance, it results in ISA Architects adding 2-wide SIMD, then 4-wide SIMD, then 8-wide and upwards. It can therefore be seen why <a href="/wiki/AVX-512" title="AVX-512">AVX-512</a> exists in x86. </p><p>Without predication, the wider the SIMD width the worse the problems get, leading to massive opcode proliferation, degraded performance, extra power consumption and unnecessary software complexity.<sup id="cite_ref-19" class="reference"><a href="#cite_note-19"><span class="cite-bracket">[</span>19<span class="cite-bracket">]</span></a></sup> </p><p>Vector processors on the other hand are designed to issue computations of variable length for an arbitrary count, n, and thus require very little setup, and no cleanup. 
Even compared to those SIMD ISAs which have masks (but no <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">setvl</code> instruction), Vector processors produce much more compact code because they do not need to perform explicit mask calculation to cover the last few elements (illustrated below). </p> <div class="mw-heading mw-heading4"><h4 id="Predicated_SIMD">Predicated SIMD</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=14" title="Edit section: Predicated SIMD"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Assuming a hypothetical predicated (mask capable) SIMD ISA, and again assuming that the SIMD instructions can cope with misaligned data, the instruction loop would look like this: </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="nl">vloop:</span> <span class="w"> </span><span class="c1"># prepare mask. 
few ISAs have min though</span> <span class="w"> </span><span class="nf">min</span><span class="w"> </span><span class="no">t0</span><span class="p">,</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">$4</span><span class="w"> </span><span class="c1">; t0 = min(n, 4)</span> <span class="w"> </span><span class="nf">shift</span><span class="w"> </span><span class="no">m</span><span class="p">,</span><span class="w"> </span><span class="no">$1</span><span class="p">,</span><span class="w"> </span><span class="no">t0</span><span class="w"> </span><span class="c1">; m = 1<<t0</span> <span class="w"> </span><span class="nf">sub</span><span class="w"> </span><span class="no">m</span><span class="p">,</span><span class="w"> </span><span class="no">m</span><span class="p">,</span><span class="w"> </span><span class="no">$1</span><span class="w"> </span><span class="c1">; m = (1<<t0)-1</span> <span class="w"> </span><span class="c1"># now do the operation, masked by m bits</span> <span class="w"> </span><span class="nf">load32x4</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">m</span> <span class="w"> </span><span class="nf">load32x4</span><span class="w"> </span><span class="no">v2</span><span class="p">,</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">m</span> <span class="w"> </span><span class="nf">mul32x4</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">a</span><span class="p">,</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">m</span><span class="w"> </span><span class="c1">; v1 := v1 * a</span> <span class="w"> </span><span 
class="nf">add32x4</span><span class="w"> </span><span class="no">v3</span><span class="p">,</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">v2</span><span class="p">,</span><span class="w"> </span><span class="no">m</span><span class="w"> </span><span class="c1">; v3 := v1 + v2</span> <span class="w"> </span><span class="nf">store32x4</span><span class="w"> </span><span class="no">v3</span><span class="p">,</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">m</span> <span class="w"> </span><span class="c1"># update x, y and n for next loop</span> <span class="w"> </span><span class="nf">addl</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">t0</span><span class="p">*</span><span class="mi">4</span><span class="w"> </span><span class="c1">; x := x + t0*4</span> <span class="w"> </span><span class="nf">addl</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">t0</span><span class="p">*</span><span class="mi">4</span> <span class="w"> </span><span class="nf">subl</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">t0</span><span class="w"> </span><span class="c1">; n := n - t0</span> <span class="w"> </span><span class="c1"># loop?</span> <span class="w"> </span><span class="nf">jgz</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">vloop</span><span class="w"> </span><span class="c1">; go back if n > 0</span> <span class="nl">out:</span> <span class="w"> </span><span class="nf">ret</span> </pre></div> <p>Here it can be seen that the code is much cleaner but a little complex: at least, 
however, there is no setup or cleanup: on the last iteration of the loop, the predicate mask will be set to either 0b0000, 0b0001, 0b0011, 0b0111 or 0b1111, resulting in between 0 and 4 SIMD element operations being performed, respectively. One additional potential complication: some RISC ISAs do not have a "min" instruction, needing instead to use a branch or scalar predicated compare. </p><p>It is clear how predicated SIMD at least merits the term "vector capable", because it can cope with variable-length vectors by using predicate masks. The final evolving step to a "true" vector ISA, however, is to not have any evidence in the ISA <i>at all</i> of a SIMD width, leaving that entirely up to the hardware. </p> <div class="mw-heading mw-heading4"><h4 id="Pure_(true)_vector_ISA"><span id="Pure_.28true.29_vector_ISA"></span>Pure (true) vector ISA</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=15" title="Edit section: Pure (true) vector ISA"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>For Cray-style vector ISAs such as RVV, an instruction called "setvl" (set vector length) is used. The hardware first defines how many data values it can process in one "vector": this could be either actual registers or it could be an internal loop (the hybrid approach, mentioned above). This maximum amount (the number of hardware "lanes") is termed "MVL" (Maximum Vector Length). Note that, as seen in SX-Aurora and Videocore IV, MVL may be an actual hardware lane quantity <i>or a virtual one</i>. <i>(Note: As mentioned in the ARM SVE2 Tutorial, programmers <b>must</b> not make the mistake of assuming a fixed vector width: consequently MVL is not a quantity that the programmer needs to know. 
This can be a little disconcerting after years of SIMD mindset).</i><sup class="noprint Inline-Template" style="white-space:nowrap;">[<i><a href="/wiki/Wikipedia:Writing_better_articles#Tone" title="Wikipedia:Writing better articles"><span title="The tone or style of material in the vicinity of this tag may not be appropriate for Wikipedia. (November 2021)">tone</span></a></i>]</sup> </p><p>On calling setvl with the number of outstanding data elements to be processed, "setvl" is permitted (essentially required) to limit that to the Maximum Vector Length (MVL) and thus returns the <i>actual</i> number that can be processed by the hardware in subsequent vector instructions, and sets the internal special register, "VL", to that same amount. ARM refers to this technique as "vector length agnostic" programming in its tutorials on SVE2.<sup id="cite_ref-20" class="reference"><a href="#cite_note-20"><span class="cite-bracket">[</span>20<span class="cite-bracket">]</span></a></sup> </p><p>Below is the Cray-style vector assembler for the same SIMD style loop, above. 
Note that t0 (which, containing a convenient copy of VL, can vary) is used instead of hard-coded constants: </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="nl">vloop:</span> <span class="w"> </span><span class="nf">setvl</span><span class="w"> </span><span class="no">t0</span><span class="p">,</span><span class="w"> </span><span class="no">n</span><span class="w"> </span><span class="c1"># VL=t0=min(MVL, n)</span> <span class="w"> </span><span class="nf">vld32</span><span class="w"> </span><span class="no">v0</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="w"> </span><span class="c1"># load vector x</span> <span class="w"> </span><span class="nf">vld32</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">y</span><span class="w"> </span><span class="c1"># load vector y</span> <span class="w"> </span><span class="nf">vmadd32</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">v0</span><span class="p">,</span><span class="w"> </span><span class="no">a</span><span class="w"> </span><span class="c1"># v1 += v0 * a</span> <span class="w"> </span><span class="nf">vst32</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">y</span><span class="w"> </span><span class="c1"># store Y</span> <span class="w"> </span><span class="nf">add</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">t0</span><span class="p">*</span><span class="mi">4</span><span class="w"> </span><span class="c1"># advance y by VL*4</span> <span class="w"> </span><span class="nf">add</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">t0</span><span 
class="p">*</span><span class="mi">4</span><span class="w"> </span><span class="c1"># advance x by VL*4</span> <span class="w"> </span><span class="nf">sub</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">t0</span><span class="w"> </span><span class="c1"># n -= VL (t0)</span> <span class="w"> </span><span class="nf">bnez</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">vloop</span><span class="w"> </span><span class="c1"># repeat if n != 0</span> </pre></div> <p>This is essentially not very different from the SIMD version (processes 4 data elements per loop), or from the initial Scalar version (processes just the one). n still contains the number of data elements remaining to be processed, but t0 contains the copy of VL – the number that is <i>going</i> to be processed in each iteration. t0 is subtracted from n after each iteration, and if n is zero then all elements have been processed. </p><p>A number of things to note, when comparing against the Predicated SIMD assembly variant: </p> <ol><li>The <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">setvl</code> instruction has embedded within it a <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">min</code> instruction</li> <li>Where the SIMD variant hard-coded both the width (4) into the creation of the mask <i>and</i> in the SIMD width (load32x4 etc.) the vector ISA equivalents have no such limit. This makes vector programs portable, vendor-independent, and future-proof.</li> <li>Setting VL effectively <i>creates a hidden predicate mask</i> that is automatically applied to the vectors</li> <li>Where with predicated SIMD the mask bitlength is limited to that which may be held in a scalar (or special mask) register, vector ISAs' mask registers have no such limitation. 
Cray-I vectors could be just over 1,000 elements (in 1977).</li></ol> <p>Thus it can be seen, very clearly, how vector ISAs reduce the number of instructions. </p><p>Also note that, just like the predicated SIMD variant, the pointers to x and y are advanced by t0 times four because they both point to 32-bit data, but that n is decremented by straight t0. Compared to the fixed-size SIMD assembler there is very little apparent difference: x and y are advanced by hard-coded constant 16, n is decremented by a hard-coded 4, so initially it is hard to appreciate the significance. The difference comes in the realisation that the vector hardware could be capable of doing 4 simultaneous operations, or 64, or 10,000: it would be the exact same vector assembler for all of them <i>and there would still be no SIMD cleanup code</i>. Even compared to the predicate-capable SIMD, it is still more compact, clearer, more elegant and uses fewer resources. </p><p>Not only is it a much more compact program (saving on L1 Cache size), but as previously mentioned, the vector version can issue far more data processing to the ALUs, again saving power because Instruction Decode and Issue can sit idle. </p><p>Additionally, the number of elements going into the function can start at zero. This sets the vector length to zero, which effectively disables all vector instructions, turning them into <a href="/wiki/No-op" class="mw-redirect" title="No-op">no-ops</a>, at runtime. Thus, unlike non-predicated SIMD, even when there are no elements to process there is still no wasted cleanup code. 
</p> <div class="mw-heading mw-heading3"><h3 id="Vector_reduction_example">Vector reduction example</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=16" title="Edit section: Vector reduction example"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>This example starts with an algorithm which involves reduction. Just as with the previous example, it will be first shown in scalar instructions, then SIMD, and finally vector instructions, starting in <a href="/wiki/C_(programming_language)" title="C (programming language)">c</a>: </p> <div class="mw-highlight mw-highlight-lang-c mw-content-ltr" dir="ltr"><pre><span></span><span class="kt">void</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">n</span><span class="p">,</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">a</span><span class="p">,</span><span class="w"> </span><span class="k">const</span><span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">x</span><span class="p">[])</span><span class="w"> </span><span class="p">{</span> <span class="w"> </span><span class="kt">int</span><span class="w"> </span><span class="n">y</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span> <span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="kt">size_t</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="o"><</span><span class="w"> </span><span class="n">n</span><span class="p">;</span><span class="w"> </span><span 
class="n">i</span><span class="o">++</span><span class="p">)</span> <span class="w"> </span><span class="n">y</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="n">x</span><span class="p">[</span><span class="n">i</span><span class="p">];</span> <span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="n">y</span><span class="p">;</span> <span class="p">}</span> </pre></div> <p>Here, an accumulator (y) is used to sum up all the values in the array, x. </p> <div class="mw-heading mw-heading4"><h4 id="Scalar_assembler_2">Scalar assembler</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=17" title="Edit section: Scalar assembler"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>The scalar version of this would load each of x, add it to y, and loop: </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="w"> </span><span class="nf">set</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="w"> </span><span class="c1">; y initialised to zero</span> <span class="nl">loop:</span> <span class="w"> </span><span class="nf">load32</span><span class="w"> </span><span class="no">r1</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="w"> </span><span class="c1">; load one 32bit data</span> <span class="w"> </span><span class="nf">add32</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">r1</span><span class="w"> </span><span class="c1">; y := y + r1</span> <span class="w"> </span><span class="nf">addl</span><span class="w"> </span><span class="no">x</span><span 
class="p">,</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">$4</span><span class="w"> </span><span class="c1">; x := x + 4</span> <span class="w"> </span><span class="nf">subl</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">$1</span><span class="w"> </span><span class="c1">; n := n - 1</span> <span class="w"> </span><span class="nf">jgz</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">loop</span><span class="w"> </span><span class="c1">; loop back if n > 0</span> <span class="nl">out:</span> <span class="w"> </span><span class="nf">ret</span><span class="w"> </span><span class="no">y</span><span class="w"> </span><span class="c1">; returns result, y</span> </pre></div> <p>This is very straightforward. "y" starts at zero, 32 bit integers are loaded one at a time into r1, added to y, and the address of the array "x" moved on to the next element in the array. </p> <div class="mw-heading mw-heading4"><h4 id="SIMD_reduction">SIMD reduction</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=18" title="Edit section: SIMD reduction"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>This is where the problems start. SIMD by design is incapable of doing arithmetic operations "inter-element". Element 0 of one SIMD register may be added to Element 0 of another register, but Element 0 may <b>not</b> be added to anything <b>other</b> than another Element 0. This places some severe limitations on potential implementations. 
For simplicity it can be assumed that n is exactly 8: </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="w"> </span><span class="nf">addl</span><span class="w"> </span><span class="no">r3</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">$16</span><span class="w"> </span><span class="c1">; for 2nd 4 of x</span> <span class="w"> </span><span class="nf">load32x4</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="w"> </span><span class="c1">; first 4 of x</span> <span class="w"> </span><span class="nf">load32x4</span><span class="w"> </span><span class="no">v2</span><span class="p">,</span><span class="w"> </span><span class="no">r3</span><span class="w"> </span><span class="c1">; 2nd 4 of x</span> <span class="w"> </span><span class="nf">add32x4</span><span class="w"> </span><span class="no">v1</span><span class="p">,</span><span class="w"> </span><span class="no">v2</span><span class="p">,</span><span class="w"> </span><span class="no">v1</span><span class="w"> </span><span class="c1">; add 2 groups</span> </pre></div> <p>At this point four adds have been performed: </p> <ul><li><code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">x[0]+x[4]</code> - First SIMD ADD: element 0 of first group added to element 0 of second group</li> <li><code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">x[1]+x[5]</code> - Second SIMD ADD: element 1 of first group added to element 1 of second group</li> <li><code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">x[2]+x[6]</code> - Third SIMD ADD: element 2 of first group added to element 2 of second group</li> <li><code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" 
dir="ltr">x[3]+x[7]</code> - Fourth SIMD ADD: element 3 of first group added to element 3 of second group</li></ul> <p>but with 4-wide SIMD being incapable <b>by design</b> of adding <code class="mw-highlight mw-highlight-lang-text mw-content-ltr" style="" dir="ltr">x[0]+x[1]</code> for example, things go rapidly downhill just as they did with the general case of using SIMD for general-purpose IAXPY loops. To sum the four partial results, two-wide SIMD can be used, followed by a single scalar add, to finally produce the answer, but, frequently, the data must be transferred out of dedicated SIMD registers before the last scalar computation can be performed. </p><p>Even with a general loop (n not fixed), the only way to use 4-wide SIMD is to assume four separate "streams", each offset by four elements. Finally, the four partial results have to be summed. Other techniques involve shuffle: examples online can be found for <a href="/wiki/AVX-512" title="AVX-512">AVX-512</a> of how to do "Horizontal Sum".<sup id="cite_ref-21" class="reference"><a href="#cite_note-21"><span class="cite-bracket">[</span>21<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-22" class="reference"><a href="#cite_note-22"><span class="cite-bracket">[</span>22<span class="cite-bracket">]</span></a></sup> </p><p>Aside from the size of the program and the complexity, an additional potential problem arises if floating-point computation is involved: the fact that the values are not being summed in strict order (four partial results) could result in rounding errors. 
</p> <div class="mw-heading mw-heading4"><h4 id="Vector_ISA_reduction">Vector ISA reduction</h4><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=19" title="Edit section: Vector ISA reduction"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Vector instruction sets have arithmetic reduction operations <i>built-in</i> to the ISA. If it is assumed that n is less than or equal to the maximum vector length, only three instructions are required: </p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="w"> </span><span class="nf">setvl</span><span class="w"> </span><span class="no">t0</span><span class="p">,</span><span class="w"> </span><span class="no">n</span><span class="w"> </span><span class="c1"># VL=t0=min(MVL, n)</span> <span class="w"> </span><span class="nf">vld32</span><span class="w"> </span><span class="no">v0</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="w"> </span><span class="c1"># load vector x</span> <span class="w"> </span><span class="nf">vredadd32</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">v0</span><span class="w"> </span><span class="c1"># reduce-add into y</span> </pre></div> <p>The code when n is larger than the maximum vector length is not that much more complex, and follows a similar pattern to the first example ("IAXPY"). 
</p> <div class="mw-highlight mw-highlight-lang-gas mw-content-ltr" dir="ltr"><pre><span></span><span class="w"> </span><span class="nf">set</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span> <span class="nl">vloop:</span> <span class="w"> </span><span class="nf">setvl</span><span class="w"> </span><span class="no">t0</span><span class="p">,</span><span class="w"> </span><span class="no">n</span><span class="w"> </span><span class="c1"># VL=t0=min(MVL, n)</span> <span class="w"> </span><span class="nf">vld32</span><span class="w"> </span><span class="no">v0</span><span class="p">,</span><span class="w"> </span><span class="no">x</span><span class="w"> </span><span class="c1"># load vector x</span> <span class="w"> </span><span class="nf">vredadd32</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">y</span><span class="p">,</span><span class="w"> </span><span class="no">v0</span><span class="w"> </span><span class="c1"># add all x into y</span> <span class="w"> </span><span class="nf">add</span><span class="w"> </span><span class="no">x</span><span class="p">,</span><span class="w"> </span><span class="no">t0</span><span class="p">*</span><span class="mi">4</span><span class="w"> </span><span class="c1"># advance x by VL*4</span> <span class="w"> </span><span class="nf">sub</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">t0</span><span class="w"> </span><span class="c1"># n -= VL (t0)</span> <span class="w"> </span><span class="nf">bnez</span><span class="w"> </span><span class="no">n</span><span class="p">,</span><span class="w"> </span><span class="no">vloop</span><span class="w"> </span><span class="c1"># repeat if n != 0</span> <span class="w"> </span><span class="nf">ret</span><span class="w"> </span><span class="no">y</span> </pre></div> <p>The 
simplicity of the algorithm is stark in comparison to SIMD. Again, just as with the IAXPY example, the algorithm is length-agnostic (even on Embedded implementations where maximum vector length could be only one). </p><p>Implementations in hardware may, if they are certain that the right answer will be produced, perform the reduction in parallel. Some vector ISAs offer a parallel reduction mode as an explicit option, for when the programmer knows that any potential rounding errors do not matter, and low latency is critical.<sup id="cite_ref-23" class="reference"><a href="#cite_note-23"><span class="cite-bracket">[</span>23<span class="cite-bracket">]</span></a></sup> </p><p>This example again highlights a key fundamental difference between true vector processors and those SIMD processors, including most commercial GPUs, which are inspired by features of vector processors. </p> <div class="mw-heading mw-heading3"><h3 id="Insights_from_examples">Insights from examples</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=20" title="Edit section: Insights from examples"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Compared to any SIMD processor claiming to be a vector processor, the order-of-magnitude reduction in program size is almost shocking. 
However, this level of elegance at the ISA level has quite a high price tag at the hardware level: </p> <ol><li>From the IAXPY example, it can be seen that unlike SIMD processors, which can simplify their internal hardware by avoiding dealing with misaligned memory access, a vector processor cannot get away with such simplification: algorithms are written which inherently rely on Vector Load and Store being successful, regardless of alignment of the start of the vector.</li> <li>Whilst from the reduction example it can be seen that, aside from <a href="/wiki/Permute_instruction" title="Permute instruction">permute instructions</a>, SIMD by definition avoids inter-lane operations entirely (element 0 can only be added to another element 0), vector processors tackle this head-on. What programmers are forced to do in software (using shuffle and other tricks, to swap data into the right "lane") vector processors must do in hardware, automatically.</li></ol> <p>Overall, then, there is a choice to have either </p> <ol><li>complex software and simplified hardware (SIMD)</li> <li>simplified software and complex hardware (vector processors)</li></ol> <p>These stark differences are what distinguish a vector processor from one that has SIMD. 
</p> <div class="mw-heading mw-heading2"><h2 id="Vector_processor_features">Vector processor features</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=21" title="Edit section: Vector processor features"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Where many SIMD ISAs borrow or are inspired by the list below, typical features that a vector processor will have are:<sup id="cite_ref-24" class="reference"><a href="#cite_note-24"><span class="cite-bracket">[</span>24<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-25" class="reference"><a href="#cite_note-25"><span class="cite-bracket">[</span>25<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-26" class="reference"><a href="#cite_note-26"><span class="cite-bracket">[</span>26<span class="cite-bracket">]</span></a></sup> </p> <ul><li><b>Vector Load and Store</b> – Vector architectures with a register-to-register design (analogous to load–store architectures for scalar processors) have instructions for transferring multiple elements between the memory and the vector registers. Typically, multiple addressing modes are supported. The unit-stride addressing mode is essential; modern vector architectures typically also support arbitrary constant strides, as well as the scatter/gather (also called <i>indexed</i>) addressing mode. Advanced architectures may also include support for <i>segment</i> load and stores, and <i>fail-first</i> variants of the standard vector load and stores. Segment loads read a vector from memory, where each element is a <a href="/wiki/Data_structure" title="Data structure">data structure</a> containing multiple members. 
The members are extracted from the data structure (element), and each extracted member is placed into a different vector register.</li> <li><b>Masked Operations</b> – <a href="/wiki/Predication_(computer_architecture)" title="Predication (computer architecture)">predicate masks</a> allow parallel if/then/else constructs without resorting to branches. This allows code with conditional statements to be vectorized.</li> <li><b>Compress and Expand</b> – usually using a bit-mask, data is linearly compressed or expanded (redistributed) based on whether bits in the mask are set or clear, whilst always preserving the sequential order and never duplicating values (unlike Gather-Scatter aka permute). These instructions feature in <a href="/wiki/AVX-512#Compress_and_expand" title="AVX-512">AVX-512</a>.</li> <li><b>Register Gather, Scatter (aka permute)</b><sup id="cite_ref-27" class="reference"><a href="#cite_note-27"><span class="cite-bracket">[</span>27<span class="cite-bracket">]</span></a></sup> – a less restrictive, more generic variation of the compress/expand theme which instead takes one vector to specify the indices to use to "reorder" another vector. Gather/scatter is more complex to implement than compress/expand, and, being inherently non-sequential, can interfere with <a href="/wiki/Chaining_(vector_processing)" title="Chaining (vector processing)">vector chaining</a>. 
Not to be confused with <a href="/wiki/Gather-scatter" class="mw-redirect" title="Gather-scatter">Gather-scatter</a> Memory Load/Store modes, Gather/scatter vector operations act on the vector registers, and are often termed a <a href="/wiki/Permute_instruction" title="Permute instruction">permute instruction</a> instead.</li> <li><b>Splat and Extract</b> – useful for interaction between scalar and vector, these broadcast a single value across a vector, or extract one item from a vector, respectively.</li> <li><b>Iota</b> – a very simple and strategically useful instruction which drops sequentially-incrementing immediates into successive elements. Usually starts from zero.</li> <li><b>Reduction and <a href="/wiki/Iteration#Computing" title="Iteration">Iteration</a></b> – operations that perform <a href="/wiki/Mapreduce" class="mw-redirect" title="Mapreduce">mapreduce</a> on a vector (for example, find the one maximum value of an entire vector, or sum all elements). Iteration is of the form <code>x[i] = y[i] + x[i-1]</code> where Reduction is of the form <code>x = y[0] + y[1]… + y[n-1]</code></li> <li><b>Matrix Multiply support</b> – either by way of algorithmically loading data from memory, or reordering (remapping) the normally linear access to vector elements, or providing "Accumulators", arbitrary-sized matrices may be efficiently processed. 
IBM POWER10 provides MMA instructions<sup id="cite_ref-28" class="reference"><a href="#cite_note-28"><span class="cite-bracket">[</span>28<span class="cite-bracket">]</span></a></sup> although for arbitrary Matrix widths that do not fit the exact SIMD size data repetition techniques are needed which is wasteful of register file resources.<sup id="cite_ref-29" class="reference"><a href="#cite_note-29"><span class="cite-bracket">[</span>29<span class="cite-bracket">]</span></a></sup><sup id="cite_ref-30" class="reference"><a href="#cite_note-30"><span class="cite-bracket">[</span>30<span class="cite-bracket">]</span></a></sup> NVidia provides a high-level Matrix <a href="/wiki/CUDA" title="CUDA">CUDA</a> API although the internal details are not available.<sup id="cite_ref-31" class="reference"><a href="#cite_note-31"><span class="cite-bracket">[</span>31<span class="cite-bracket">]</span></a></sup> The most resource-efficient technique is in-place reordering of access to otherwise linear vector data.</li> <li><b>Advanced Math formats</b> – often includes <a href="/wiki/Galois_field" class="mw-redirect" title="Galois field">Galois field</a> arithmetic, but can include <a href="/wiki/Binary-coded_decimal" title="Binary-coded decimal">binary-coded decimal</a> or decimal fixed-point, and support for much larger (arbitrary precision) arithmetic operations by supporting parallel carry-in and carry-out</li> <li><b><a href="/wiki/Bit_manipulation" title="Bit manipulation">Bit manipulation</a></b> – including vectorised versions of bit-level permutation operations, bitfield insert and extract, centrifuge operations, population count, and <a href="/wiki/Bit_Manipulation_Instruction_Sets" class="mw-redirect" title="Bit Manipulation Instruction Sets">many others</a>.</li></ul> <div class="mw-heading mw-heading3"><h3 id="GPU_vector_processing_features">GPU vector processing features</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a 
href="/w/index.php?title=Vector_processor&action=edit&section=22" title="Edit section: GPU vector processing features"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>With many 3D <a href="/wiki/Shader" title="Shader">shader</a> applications needing <a href="/wiki/Trigonometric" class="mw-redirect" title="Trigonometric">trigonometric</a> operations as well as short vectors for common operations (RGB, ARGB, XYZ, XYZW) support for the following is typically present in modern GPUs, in addition to those found in vector processors: </p> <ul><li><b>Sub-vectors</b> – elements may typically contain two, three or four sub-elements (vec2, vec3, vec4) where any given bit of a predicate mask applies to the whole vec2/3/4, not the elements in the sub-vector. Sub-vectors are also introduced in RISC-V RVV (termed "LMUL").<sup id="cite_ref-32" class="reference"><a href="#cite_note-32"><span class="cite-bracket">[</span>32<span class="cite-bracket">]</span></a></sup> Subvectors are a critical integral part of the <a href="/wiki/Vulkan" title="Vulkan">Vulkan</a> <a href="/wiki/SPIR-V" class="mw-redirect" title="SPIR-V">SPIR-V</a> spec.</li> <li><b>Sub-vector Swizzle</b> – aka "Lane Shuffling" which allows sub-vector inter-element computations without needing extra (costly, wasteful) instructions to move the sub-elements into the correct SIMD "lanes" and also saves predicate mask bits. Effectively this is an in-flight <a href="/wiki/Permute_instruction" title="Permute instruction">mini-permute</a> of the sub-vector, heavily features in 3D Shader binaries, and is sufficiently important as to be part of the Vulkan SPIR-V spec. 
The Broadcom <a href="/wiki/Videocore" class="mw-redirect" title="Videocore">Videocore</a> IV uses the terminology "Lane rotate"<sup id="cite_ref-33" class="reference"><a href="#cite_note-33"><span class="cite-bracket">[</span>33<span class="cite-bracket">]</span></a></sup> where the rest of the industry uses the term <a href="/wiki/Swizzling_(computer_graphics)" title="Swizzling (computer graphics)">"swizzle"</a>.<sup id="cite_ref-34" class="reference"><a href="#cite_note-34"><span class="cite-bracket">[</span>34<span class="cite-bracket">]</span></a></sup></li> <li><b>Transcendentals</b> – <a href="/wiki/Trigonometric" class="mw-redirect" title="Trigonometric">trigonometric</a> operations such as <a href="/wiki/Sine" class="mw-redirect" title="Sine">sine</a>, <a href="/wiki/Cosine" class="mw-redirect" title="Cosine">cosine</a> and <a href="/wiki/Logarithm" title="Logarithm">logarithm</a> obviously feature much more prominently in 3D than in many demanding <a href="/wiki/High-performance_computing" title="High-performance computing">HPC</a> workloads. Of interest, however, is that speed is far more important than accuracy in 3D for GPUs, where computation of pixel coordinates simply does not require high precision. The Vulkan specification recognises this and sets surprisingly low accuracy requirements, so that GPU Hardware can reduce power usage. 
The concept of reducing accuracy where it is simply not needed is explored in the <a href="/wiki/MIPS-3D" title="MIPS-3D">MIPS-3D</a> extension.</li></ul> <div class="mw-heading mw-heading3"><h3 id="Fault_(or_Fail)_First"><span id="Fault_.28or_Fail.29_First"></span>Fault (or Fail) First</h3><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=23" title="Edit section: Fault (or Fail) First"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Introduced in ARM SVE2 and RISC-V RVV is the concept of speculative sequential Vector Loads. ARM SVE2 has a special register named "First Fault Register",<sup id="cite_ref-35" class="reference"><a href="#cite_note-35"><span class="cite-bracket">[</span>35<span class="cite-bracket">]</span></a></sup> where RVV modifies (truncates) the Vector Length (VL).<sup id="cite_ref-36" class="reference"><a href="#cite_note-36"><span class="cite-bracket">[</span>36<span class="cite-bracket">]</span></a></sup> </p><p>The basic principle of ffirst is to attempt a large sequential Vector Load, but to allow the hardware to arbitrarily truncate the <i>actual</i> amount loaded to either the amount that would succeed without raising a memory fault or simply to an amount (greater than zero) that is most convenient. The important factor is that <i>subsequent</i> instructions are notified or may determine exactly how many Loads actually succeeded, using that quantity to only carry out work on the data that has actually been loaded. 
</p><p>Contrast this situation with SIMD, which has a fixed (inflexible) load width and fixed data processing width, is unable to cope with loads that cross page boundaries, and even if it were, would be unable to adapt to what actually succeeded; yet, paradoxically, if the SIMD program were to even attempt to find out in advance (in each inner loop, every time) what might optimally succeed, those instructions would only serve to hinder performance because they would, by necessity, be part of the critical inner loop. </p><p>This begins to hint at the reason why ffirst is so innovative, and is best illustrated by memcpy or strcpy when implemented with standard 128-bit non-predicated non-ffirst SIMD. For IBM POWER9 the number of hand-optimised instructions to implement strncpy is in excess of 240.<sup id="cite_ref-37" class="reference"><a href="#cite_note-37"><span class="cite-bracket">[</span>37<span class="cite-bracket">]</span></a></sup> By contrast, the same strncpy routine in hand-optimised RVV assembler is a mere 22 instructions.<sup id="cite_ref-38" class="reference"><a href="#cite_note-38"><span class="cite-bracket">[</span>38<span class="cite-bracket">]</span></a></sup> </p><p>The above SIMD example could potentially fault and fail at the end of memory, due to attempts to read too many values: it could also cause significant numbers of page or misaligned faults by similarly crossing over boundaries. In contrast, by allowing the vector architecture the freedom to decide how many elements to load, the first part of a strncpy, if beginning initially on a sub-optimal memory boundary, may return just enough loads such that on <i>subsequent</i> iterations of the loop the batches of vectorised memory reads are optimally aligned with the underlying caches and virtual memory arrangements. 
Additionally, the hardware may choose to use the opportunity to end any given loop iteration's memory reads <i>exactly</i> on a page boundary (avoiding a costly second TLB lookup), with speculative execution preparing the next virtual memory page whilst data is still being processed in the current loop. All of this is determined by the hardware, not the program itself.<sup id="cite_ref-39" class="reference"><a href="#cite_note-39"><span class="cite-bracket">[</span>39<span class="cite-bracket">]</span></a></sup> </p> <div class="mw-heading mw-heading2"><h2 id="Performance_and_speed_up">Performance and speed up</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=24" title="Edit section: Performance and speed up"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <p>Let <i><b>r</b></i> be the vector speed ratio and <i><b>f</b></i> be the vectorization ratio. If the vector unit adds an array of 64 numbers 10 times faster than its equivalent scalar counterpart, then r = 10. Also, if the total number of operations in a program is 100, out of which only 10 are scalar (after vectorization), then f = 0.9, i.e., 90% of the work is done by the vector unit. 
It follows that the achievable speedup is: </p><p><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle r/[(1-f)*r+f]}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>r</mi> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mo stretchy="false">[</mo> <mo stretchy="false">(</mo> <mn>1</mn> <mo>−<!-- − --></mo> <mi>f</mi> <mo stretchy="false">)</mo> <mo>∗<!-- ∗ --></mo> <mi>r</mi> <mo>+</mo> <mi>f</mi> <mo stretchy="false">]</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle r/[(1-f)*r+f]}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/9d90d10baeb1dcab408e8abb5012b8743e8dc7d8" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:17.958ex; height:2.843ex;" alt="{\displaystyle r/[(1-f)*r+f]}"></span> </p><p>So, even if the performance of the vector unit is very high (<span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle r=\infty }"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mi>r</mi> <mo>=</mo> <mi mathvariant="normal">∞<!-- ∞ --></mi> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle r=\infty }</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/d38c8164ddb69351cdab28da290255fde3b846d4" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.338ex; width:6.471ex; height:1.676ex;" alt="{\displaystyle r=\infty }"></span>) there is a speedup less than <span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" 
style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML" alttext="{\displaystyle 1/(1-f)}"> <semantics> <mrow class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0"> <mn>1</mn> <mrow class="MJX-TeXAtom-ORD"> <mo>/</mo> </mrow> <mo stretchy="false">(</mo> <mn>1</mn> <mo>−<!-- − --></mo> <mi>f</mi> <mo stretchy="false">)</mo> </mstyle> </mrow> <annotation encoding="application/x-tex">{\displaystyle 1/(1-f)}</annotation> </semantics> </math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/3b931cd65e871463804f32f213ac36f6340fd0a2" class="mwe-math-fallback-image-inline mw-invert skin-invert" aria-hidden="true" style="vertical-align: -0.838ex; width:9.416ex; height:2.843ex;" alt="{\displaystyle 1/(1-f)}"></span>, which suggests that the ratio <i><b>f</b></i> is crucial to the performance. This ratio depends on the efficiency of the compilation, which is affected by factors such as the adjacency of the elements in memory. </p> <div class="mw-heading mw-heading2"><h2 id="See_also">See also</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=25" title="Edit section: See also"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <ul><li><a href="/wiki/SX_architecture" class="mw-redirect" title="SX architecture">SX architecture</a></li> <li><a href="/wiki/Duncan%27s_taxonomy#Pipelined_vector_processors" title="Duncan's taxonomy">Duncan's taxonomy</a> on pipelined vector processors</li> <li><a href="/wiki/GPGPU" class="mw-redirect" title="GPGPU">GPGPU</a></li> <li><a href="/wiki/Compute_kernel" title="Compute kernel">Compute kernel</a></li> <li><a href="/wiki/Stream_processing" title="Stream processing">Stream processing</a></li> <li><a href="/wiki/Automatic_vectorization" title="Automatic vectorization">Automatic vectorization</a></li> <li><a href="/wiki/Chaining_(vector_processing)" title="Chaining (vector processing)">Chaining (vector 
processing)</a></li> <li><a href="/wiki/Computer_for_operations_with_functions" title="Computer for operations with functions">Computer for operations with functions</a></li> <li><a href="/wiki/RISC-V" title="RISC-V">RISC-V</a>, an open ISA standard with an associated variable width <a href="/wiki/RISC-V#Vector_set" title="RISC-V">vector extension</a>.</li> <li><a href="/wiki/Barrel_processor" title="Barrel processor">Barrel processor</a></li> <li><a href="/wiki/Tensor_Processing_Unit" title="Tensor Processing Unit">Tensor Processing Unit</a></li> <li><a href="/wiki/History_of_supercomputing" title="History of supercomputing">History of supercomputing</a></li> <li><a href="/wiki/Supercomputer_architecture" title="Supercomputer architecture">Supercomputer architecture</a></li></ul> <div class="mw-heading mw-heading2"><h2 id="References">References</h2><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Vector_processor&action=edit&section=26" title="Edit section: References"><span>edit</span></a><span class="mw-editsection-bracket">]</span></span></div> <style data-mw-deduplicate="TemplateStyles:r1239543626">.mw-parser-output .reflist{margin-bottom:0.5em;list-style-type:decimal}@media screen{.mw-parser-output .reflist{font-size:90%}}.mw-parser-output .reflist .references{font-size:100%;margin-bottom:0;list-style-type:inherit}.mw-parser-output .reflist-columns-2{column-width:30em}.mw-parser-output .reflist-columns-3{column-width:25em}.mw-parser-output .reflist-columns{margin-top:0.3em}.mw-parser-output .reflist-columns ol{margin-top:0}.mw-parser-output .reflist-columns li{page-break-inside:avoid;break-inside:avoid-column}.mw-parser-output .reflist-upper-alpha{list-style-type:upper-alpha}.mw-parser-output .reflist-upper-roman{list-style-type:upper-roman}.mw-parser-output .reflist-lower-alpha{list-style-type:lower-alpha}.mw-parser-output .reflist-lower-greek{list-style-type:lower-greek}.mw-parser-output 
.reflist-lower-roman{list-style-type:lower-roman}</style><div class="reflist"> <div class="mw-references-wrap mw-references-columns"><ol class="references"> <li id="cite_note-newscientist19760617_dap-1"><span class="mw-cite-backlink"><b><a href="#cite_ref-newscientist19760617_dap_1-0">^</a></b></span> <span class="reference-text"><style data-mw-deduplicate="TemplateStyles:r1238218222">.mw-parser-output cite.citation{font-style:inherit;word-wrap:break-word}.mw-parser-output .citation q{quotes:"\"""\"""'""'"}.mw-parser-output .citation:target{background-color:rgba(0,127,255,0.133)}.mw-parser-output .id-lock-free.id-lock-free a{background:url("//upload.wikimedia.org/wikipedia/commons/6/65/Lock-green.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-limited.id-lock-limited a,.mw-parser-output .id-lock-registration.id-lock-registration a{background:url("//upload.wikimedia.org/wikipedia/commons/d/d6/Lock-gray-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .id-lock-subscription.id-lock-subscription a{background:url("//upload.wikimedia.org/wikipedia/commons/a/aa/Lock-red-alt-2.svg")right 0.1em center/9px no-repeat}.mw-parser-output .cs1-ws-icon a{background:url("//upload.wikimedia.org/wikipedia/commons/4/4c/Wikisource-logo.svg")right 0.1em center/12px no-repeat}body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-free a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-limited a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-registration a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .id-lock-subscription a,body:not(.skin-timeless):not(.skin-minerva) .mw-parser-output .cs1-ws-icon a{background-size:contain;padding:0 1em 0 0}.mw-parser-output .cs1-code{color:inherit;background:inherit;border:none;padding:inherit}.mw-parser-output .cs1-hidden-error{display:none;color:var(--color-error,#d33)}.mw-parser-output 
.cs1-visible-error{color:var(--color-error,#d33)}.mw-parser-output .cs1-maint{display:none;color:#085;margin-left:0.3em}.mw-parser-output .cs1-kern-left{padding-left:0.2em}.mw-parser-output .cs1-kern-right{padding-right:0.2em}.mw-parser-output .citation .mw-selflink{font-weight:inherit}@media screen{.mw-parser-output .cs1-format{font-size:95%}html.skin-theme-clientpref-night .mw-parser-output .cs1-maint{color:#18911f}}@media screen and (prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .cs1-maint{color:#18911f}}</style><cite id="CITEREFParkinson1976" class="citation magazine cs1">Parkinson, Dennis (17 June 1976). <a rel="nofollow" class="external text" href="https://archive.org/details/bub_gb_m8S4bXj3dcMC/page/n11/mode/2up">"Computers by the thousand"</a>. <i>New Scientist</i>. pp. 626–627<span class="reference-accessdate">. Retrieved <span class="nowrap">7 July</span> 2024</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=article&rft.jtitle=New+Scientist&rft.atitle=Computers+by+the+thousand&rft.pages=626-627&rft.date=1976-06-17&rft.aulast=Parkinson&rft.aufirst=Dennis&rft_id=https%3A%2F%2Farchive.org%2Fdetails%2Fbub_gb_m8S4bXj3dcMC%2Fpage%2Fn11%2Fmode%2F2up&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-Malinovsky-2"><span class="mw-cite-backlink"><b><a href="#cite_ref-Malinovsky_2-0">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFB.N._Malinovsky1995" class="citation book cs1">B.N. Malinovsky (1995). <i>The history of computer technology in their faces (in Russian)</i>. KIT. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/5770761318" title="Special:BookSources/5770761318"><bdi>5770761318</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=The+history+of+computer+technology+in+their+faces+%28in+Russian%29&rft.pub=KIT&rft.date=1995&rft.isbn=5770761318&rft.au=B.N.+Malinovsky&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-3"><span class="mw-cite-backlink"><b><a href="#cite_ref-3">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="http://miaowgpu.org/">MIAOW Vertical Research Group</a></span> </li> <li id="cite_note-4"><span class="mw-cite-backlink"><b><a href="#cite_ref-4">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://github.com/VerticalResearchGroup/miaow/wiki/Architecture-Overview">MIAOW GPU</a></span> </li> <li id="cite_note-5"><span class="mw-cite-backlink"><b><a href="#cite_ref-5">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation pressrelease cs1"><a rel="nofollow" class="external text" href="https://www.globenewswire.com/en/news-release/2022/12/07/2569216/0/en/Andes-Announces-RISC-V-Multicore-1024-bit-Vector-Processor-AX45MPV.html">"Andes Announces RISC-V Multicore 1024-bit Vector Processor: AX45MPV"</a> (Press release). GlobeNewswire. 7 December 2022<span class="reference-accessdate">. 
Retrieved <span class="nowrap">23 December</span> 2022</span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Andes+Announces+RISC-V+Multicore+1024-bit+Vector+Processor%3A+AX45MPV&rft.pub=GlobeNewswire&rft.date=2022-12-07&rft_id=https%3A%2F%2Fwww.globenewswire.com%2Fen%2Fnews-release%2F2022%2F12%2F07%2F2569216%2F0%2Fen%2FAndes-Announces-RISC-V-Multicore-1024-bit-Vector-Processor-AX45MPV.html&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-6"><span class="mw-cite-backlink"><b><a href="#cite_ref-6">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMiyaokaChoiTogawaYanagisawa2002" class="citation conference cs1">Miyaoka, Y.; Choi, J.; Togawa, N.; Yanagisawa, M.; Ohtsuki, T. (2002). <i>An algorithm of hardware unit generation for processor core synthesis with packed SIMD type instructions</i>. Asia-Pacific Conference on Circuits and Systems. Vol. 1. pp. 171–176. <a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1109%2FAPCCAS.2002.1114930">10.1109/APCCAS.2002.1114930</a>. 
<a href="/wiki/Hdl_(identifier)" class="mw-redirect" title="Hdl (identifier)">hdl</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://hdl.handle.net/2065%2F10689">2065/10689</a></span>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=conference&rft.btitle=An+algorithm+of+hardware+unit+generation+for+processor+core+synthesis+with+packed+SIMD+type+instructions&rft.pages=171-176&rft.date=2002&rft_id=info%3Ahdl%2F2065%2F10689&rft_id=info%3Adoi%2F10.1109%2FAPCCAS.2002.1114930&rft.aulast=Miyaoka&rft.aufirst=Y.&rft.au=Choi%2C+J.&rft.au=Togawa%2C+N.&rft.au=Yanagisawa%2C+M.&rft.au=Ohtsuki%2C+T.&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-7"><span class="mw-cite-backlink"><b><a href="#cite_ref-7">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#sec-vector-config">"Riscv-v-spec/V-spec.adoc at master · riscv/Riscv-v-spec"</a>. <i><a href="/wiki/GitHub" title="GitHub">GitHub</a></i>. 
16 June 2023.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=Riscv-v-spec%2FV-spec.adoc+at+master+%C2%B7+riscv%2FRiscv-v-spec&rft.date=2023-06-16&rft_id=https%3A%2F%2Fgithub.com%2Friscv%2Friscv-v-spec%2Fblob%2Fmaster%2Fv-spec.adoc%23sec-vector-config&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-8"><span class="mw-cite-backlink"><b><a href="#cite_ref-8">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://sxauroratsubasa.sakura.ne.jp/documents/sdk/pdfs/VectorEngine-as-manual-v1.3.pdf">"Vector Engine Assembly Language Reference Manual"</a> <span class="cs1-format">(PDF)</span>. 16 June 2023.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Vector+Engine+Assembly+Language+Reference+Manual&rft.date=2023-06-16&rft_id=https%3A%2F%2Fsxauroratsubasa.sakura.ne.jp%2Fdocuments%2Fsdk%2Fpdfs%2FVectorEngine-as-manual-v1.3.pdf&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-9"><span class="mw-cite-backlink"><b><a href="#cite_ref-9">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://developer.arm.com/tools-and-software/server-and-hpc/compile/arm-instruction-emulator/resources/tutorials/sve/sve-vs-sve2/single-page">"Documentation – Arm Developer"</a>.</cite><span 
title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Documentation+%E2%80%93+Arm+Developer&rft_id=https%3A%2F%2Fdeveloper.arm.com%2Ftools-and-software%2Fserver-and-hpc%2Fcompile%2Farm-instruction-emulator%2Fresources%2Ftutorials%2Fsve%2Fsve-vs-sve2%2Fsingle-page&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-10"><span class="mw-cite-backlink"><b><a href="#cite_ref-10">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="http://thebeardsage.com/vector-architecture/">"Vector Architecture"</a>. 27 April 2020.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Vector+Architecture&rft.date=2020-04-27&rft_id=http%3A%2F%2Fthebeardsage.com%2Fvector-architecture%2F&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-11"><span class="mw-cite-backlink"><b><a href="#cite_ref-11">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="http://www.inf.ed.ac.uk/teaching/courses/pa/Notes/lecture11-vector.pdf">Vector and SIMD processors, slides 12-13</a></span> </li> <li id="cite_note-12"><span class="mw-cite-backlink"><b><a href="#cite_ref-12">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://course.ece.cmu.edu/~ece740/f13/lib/exe/fetch.php?media=onur-740-fall13-module5.1.1-simd-and-gpus-part1.pdf">Array vs Vector Processing, slides 5-7</a></span> </li> <li id="cite_note-13"><span class="mw-cite-backlink"><b><a href="#cite_ref-13">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" 
href="https://course.ece.cmu.edu/~ece740/f13/lib/exe/fetch.php?media=seth-740-fall13-module5.1-simd-vector-gpu.pdf">SIMD vs Vector GPU, slides 22-24</a></span> </li> <li id="cite_note-14"><span class="mw-cite-backlink"><b><a href="#cite_ref-14">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFPattersonHennessy1998" class="citation book cs1"><a href="/wiki/David_Patterson_(computer_scientist)" title="David Patterson (computer scientist)">Patterson, David A.</a>; <a href="/wiki/John_L._Hennessy" title="John L. Hennessy">Hennessy, John L.</a> (1998). <span class="id-lock-registration" title="Free registration required"><a rel="nofollow" class="external text" href="https://archive.org/details/computerorganiz000henn"><i>Computer Organization and Design: the Hardware/Software Interface page 751-2</i></a></span> (2nd ed.). Morgan Kaufmann. p. <a rel="nofollow" class="external text" href="https://archive.org/details/computerorganiz000henn/page/751">751-2</a>. 
<a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/155860491X" title="Special:BookSources/155860491X"><bdi>155860491X</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=book&rft.btitle=Computer+Organization+and+Design%3A+the+Hardware%2FSoftware+Interface+page+751-2&rft.pages=751-2&rft.edition=2nd&rft.pub=Morgan+Kaufmann&rft.date=1998&rft.isbn=155860491X&rft.aulast=Patterson&rft.aufirst=David+A.&rft.au=Hennessy%2C+John+L.&rft_id=https%3A%2F%2Farchive.org%2Fdetails%2Fcomputerorganiz000henn&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-15"><span class="mw-cite-backlink"><b><a href="#cite_ref-15">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc">"Riscv-v-spec/V-spec.adoc at master · riscv/Riscv-v-spec"</a>. <i><a href="/wiki/GitHub" title="GitHub">GitHub</a></i>. 
19 November 2022.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=Riscv-v-spec%2FV-spec.adoc+at+master+%C2%B7+riscv%2FRiscv-v-spec&rft.date=2022-11-19&rft_id=https%3A%2F%2Fgithub.com%2Friscv%2Friscv-v-spec%2Fblob%2Fmaster%2Fv-spec.adoc&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-16"><span class="mw-cite-backlink"><b><a href="#cite_ref-16">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://github.com/hermanhermitage/videocoreiv/wiki/VideoCore-IV-Programmers-Manual">Videocore IV Programmer's Manual</a></span> </li> <li id="cite_note-17"><span class="mw-cite-backlink"><b><a href="#cite_ref-17">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://jbush001.github.io/2016/03/02/videocore-qpu-pipeline.html">Videocore IV QPU analysis by Jeff Bush</a></span> </li> <li id="cite_note-18"><span class="mw-cite-backlink"><b><a href="#cite_ref-18">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/coding-for-neon---part-3-matrix-multiplication">"Coding for Neon - Part 3 Matrix Multiplication"</a>. 
11 September 2013.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Coding+for+Neon+-+Part+3+Matrix+Multiplication&rft.date=2013-09-11&rft_id=https%3A%2F%2Fcommunity.arm.com%2Fdeveloper%2Fip-products%2Fprocessors%2Fb%2Fprocessors-ip-blog%2Fposts%2Fcoding-for-neon---part-3-matrix-multiplication&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-19"><span class="mw-cite-backlink"><b><a href="#cite_ref-19">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://www.sigarch.org/simd-instructions-considered-harmful/">SIMD considered harmful</a></span> </li> <li id="cite_note-20"><span class="mw-cite-backlink"><b><a href="#cite_ref-20">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://developer.arm.com/documentation/102131/latest/">ARM SVE2 tutorial</a></span> </li> <li id="cite_note-21"><span class="mw-cite-backlink"><b><a href="#cite_ref-21">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://stackoverflow.com/questions/52782940/1-to-4-broadcast-and-4-to-1-reduce-in-avx-512">"Sse - 1-to-4 broadcast and 4-to-1 reduce in AVX-512"</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Sse+-+1-to-4+broadcast+and+4-to-1+reduce+in+AVX-512&rft_id=https%3A%2F%2Fstackoverflow.com%2Fquestions%2F52782940%2F1-to-4-broadcast-and-4-to-1-reduce-in-avx-512&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-22"><span class="mw-cite-backlink"><b><a href="#cite_ref-22">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" 
href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026">"Assembly - Fastest way to do horizontal SSE vector sum (Or other reduction)"</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=Assembly+-+Fastest+way+to+do+horizontal+SSE+vector+sum+%28Or+other+reduction%29&rft_id=https%3A%2F%2Fstackoverflow.com%2Fquestions%2F6996764%2Ffastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction%2F35270026%2335270026&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-23"><span class="mw-cite-backlink"><b><a href="#cite_ref-23">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vector-reduction-operations">"Riscv-v-spec/V-spec.adoc at master · riscv/Riscv-v-spec"</a>. <i><a href="/wiki/GitHub" title="GitHub">GitHub</a></i>. 
19 November 2022.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=GitHub&rft.atitle=Riscv-v-spec%2FV-spec.adoc+at+master+%C2%B7+riscv%2FRiscv-v-spec&rft.date=2022-11-19&rft_id=https%3A%2F%2Fgithub.com%2Friscv%2Friscv-v-spec%2Fblob%2Fmaster%2Fv-spec.adoc%23vector-reduction-operations&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-24"><span class="mw-cite-backlink"><b><a href="#cite_ref-24">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="http://www.lanl.gov/conferencess/salishan/salishan2004/scott.pdf">Cray Overview</a></span> </li> <li id="cite_note-25"><span class="mw-cite-backlink"><b><a href="#cite_ref-25">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc">RISC-V RVV ISA</a></span> </li> <li id="cite_note-26"><span class="mw-cite-backlink"><b><a href="#cite_ref-26">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://sx-aurora.github.io/posts/VE-HW-overview/">SX-Aurora Overview</a></span> </li> <li id="cite_note-27"><span class="mw-cite-backlink"><b><a href="#cite_ref-27">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#vector-register-gather-instructions">RVV register gather-scatter instructions</a></span> </li> <li id="cite_note-28"><span class="mw-cite-backlink"><b><a href="#cite_ref-28">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://m.youtube.com/watch?v=27VRdI2BGWg&t=1260">"IBM's POWER10 Processor - William Starke & Brian W. Thompto, IBM"</a>. 
<i><a href="/wiki/YouTube" title="YouTube">YouTube</a></i>. <a rel="nofollow" class="external text" href="https://ghostarchive.org/varchive/youtube/20211211/27VRdI2BGWg">Archived</a> from the original on 2021-12-11.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=unknown&rft.jtitle=YouTube&rft.atitle=IBM%27s+POWER10+Processor+-+William+Starke+%26+Brian+W.+Thompto%2C+IBM&rft_id=https%3A%2F%2Fm.youtube.com%2Fwatch%3Fv%3D27VRdI2BGWg%26t%3D1260&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-29"><span class="mw-cite-backlink"><b><a href="#cite_ref-29">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFMoreiraBartonBattleBergner2021" class="citation arxiv cs1">Moreira, José E.; Barton, Kit; Battle, Steven; Bergner, Peter; Bertran, Ramon; Bhat, Puneeth; Caldeira, Pedro; Edelsohn, David; Fossum, Gordon; Frey, Brad; Ivanovic, Nemanja; Kerchner, Chip; Lim, Vincent; Kapoor, Shakti; Tulio Machado Filho; Silvia Melitta Mueller; Olsson, Brett; Sadasivam, Satish; Saleil, Baptiste; Schmidt, Bill; Srinivasaraghavan, Rajalakshmi; Srivatsan, Shricharan; Thompto, Brian; Wagner, Andreas; Wu, Nelson (2021). "A matrix math facility for Power ISA(TM) processors". 
<a href="/wiki/ArXiv_(identifier)" class="mw-redirect" title="ArXiv (identifier)">arXiv</a>:<span class="id-lock-free" title="Freely accessible"><a rel="nofollow" class="external text" href="https://arxiv.org/abs/2104.03142">2104.03142</a></span> [<a rel="nofollow" class="external text" href="https://arxiv.org/archive/cs.AR">cs.AR</a>].</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal&rft.genre=preprint&rft.jtitle=arXiv&rft.atitle=A+matrix+math+facility+for+Power+ISA%28TM%29+processors&rft.date=2021&rft_id=info%3Aarxiv%2F2104.03142&rft.aulast=Moreira&rft.aufirst=Jos%C3%A9+E.&rft.au=Barton%2C+Kit&rft.au=Battle%2C+Steven&rft.au=Bergner%2C+Peter&rft.au=Bertran%2C+Ramon&rft.au=Bhat%2C+Puneeth&rft.au=Caldeira%2C+Pedro&rft.au=Edelsohn%2C+David&rft.au=Fossum%2C+Gordon&rft.au=Frey%2C+Brad&rft.au=Ivanovic%2C+Nemanja&rft.au=Kerchner%2C+Chip&rft.au=Lim%2C+Vincent&rft.au=Kapoor%2C+Shakti&rft.au=Tulio+Machado+Filho&rft.au=Silvia+Melitta+Mueller&rft.au=Olsson%2C+Brett&rft.au=Sadasivam%2C+Satish&rft.au=Saleil%2C+Baptiste&rft.au=Schmidt%2C+Bill&rft.au=Srinivasaraghavan%2C+Rajalakshmi&rft.au=Srivatsan%2C+Shricharan&rft.au=Thompto%2C+Brian&rft.au=Wagner%2C+Andreas&rft.au=Wu%2C+Nelson&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-30"><span class="mw-cite-backlink"><b><a href="#cite_ref-30">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite id="CITEREFKrikelis1996" class="citation book cs1">Krikelis, Anargyros (1996). <a rel="nofollow" class="external text" href="https://link.springer.com/chapter/10.1007/978-1-4471-1011-8_8">"A Modular Massively Parallel Processor for Volumetric Visualisation Processing"</a>. <i>High Performance Computing for Computer Graphics and Visualisation</i>. pp. 101–124. 
<a href="/wiki/Doi_(identifier)" class="mw-redirect" title="Doi (identifier)">doi</a>:<a rel="nofollow" class="external text" href="https://doi.org/10.1007%2F978-1-4471-1011-8_8">10.1007/978-1-4471-1011-8_8</a>. <a href="/wiki/ISBN_(identifier)" class="mw-redirect" title="ISBN (identifier)">ISBN</a> <a href="/wiki/Special:BookSources/978-3-540-76016-0" title="Special:BookSources/978-3-540-76016-0"><bdi>978-3-540-76016-0</bdi></a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=bookitem&rft.atitle=A+Modular+Massively+Parallel+Processor+for+Volumetric+Visualisation+Processing&rft.btitle=High+Performance+Computing+for+Computer+Graphics+and+Visualisation&rft.pages=101-124&rft.date=1996&rft_id=info%3Adoi%2F10.1007%2F978-1-4471-1011-8_8&rft.isbn=978-3-540-76016-0&rft.aulast=Krikelis&rft.aufirst=Anargyros&rft_id=https%3A%2F%2Flink.springer.com%2Fchapter%2F10.1007%2F978-1-4471-1011-8_8&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-31"><span class="mw-cite-backlink"><b><a href="#cite_ref-31">^</a></b></span> <span class="reference-text"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1238218222"><cite class="citation web cs1"><a rel="nofollow" class="external text" href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma">"CUDA C++ Programming Guide"</a>.</cite><span title="ctx_ver=Z39.88-2004&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&rft.genre=unknown&rft.btitle=CUDA+C%2B%2B+Programming+Guide&rft_id=https%3A%2F%2Fdocs.nvidia.com%2Fcuda%2Fcuda-c-programming-guide%2Findex.html%23wmma&rfr_id=info%3Asid%2Fen.wikipedia.org%3AVector+processor" class="Z3988"></span></span> </li> <li id="cite_note-32"><span class="mw-cite-backlink"><b><a href="#cite_ref-32">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" 
href="https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#mapping-for-lmul-1-2">LMUL > 1 in RVV</a></span> </li> <li id="cite_note-33"><span class="mw-cite-backlink"><b><a href="#cite_ref-33">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://patents.google.com/patent/US20110227920">Abandoned US patent US20110227920-0096</a></span> </li> <li id="cite_note-34"><span class="mw-cite-backlink"><b><a href="#cite_ref-34">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://github.com/hermanhermitage/videocoreiv-qpu">Videocore IV QPU</a></span> </li> <li id="cite_note-35"><span class="mw-cite-backlink"><b><a href="#cite_ref-35">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://developer.arm.com/tools-and-software/server-and-hpc/compile/arm-instruction-emulator/resources/tutorials/sve/sve-vs-sve2/single-page">Introduction to ARM SVE2</a></span> </li> <li id="cite_note-36"><span class="mw-cite-backlink"><b><a href="#cite_ref-36">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#unit-stride-fault-only-first-loads">RVV fault-first loads</a></span> </li> <li id="cite_note-37"><span class="mw-cite-backlink"><b><a href="#cite_ref-37">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://patchwork.ozlabs.org/project/glibc/patch/20200904165653.16202-1-rzinsly@linux.ibm.com/">PATCH to libc6 to add optimised POWER9 strncpy</a></span> </li> <li id="cite_note-38"><span class="mw-cite-backlink"><b><a href="#cite_ref-38">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://github.com/riscv/riscv-v-spec/blob/master/example/strncpy.s">RVV strncpy example</a></span> </li> <li id="cite_note-39"><span class="mw-cite-backlink"><b><a 
href="#cite_ref-39">^</a></b></span> <span class="reference-text"><a rel="nofollow" class="external text" href="https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf">ARM SVE2 paper by N. Stephens</a></span> </li> </ol></div></div> <p><br /> </p> <div class="navbox-styles"><style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul ul{display:inline}.mw-parser-output .hlist .mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt 
dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><style data-mw-deduplicate="TemplateStyles:r1236075235">.mw-parser-output .navbox{box-sizing:border-box;border:1px solid #a2a9b1;width:100%;clear:both;font-size:88%;text-align:center;padding:1px;margin:1em auto 0}.mw-parser-output .navbox .navbox{margin-top:0}.mw-parser-output .navbox+.navbox,.mw-parser-output .navbox+.navbox-styles+.navbox{margin-top:-1px}.mw-parser-output .navbox-inner,.mw-parser-output .navbox-subgroup{width:100%}.mw-parser-output .navbox-group,.mw-parser-output .navbox-title,.mw-parser-output .navbox-abovebelow{padding:0.25em 1em;line-height:1.5em;text-align:center}.mw-parser-output .navbox-group{white-space:nowrap;text-align:right}.mw-parser-output .navbox,.mw-parser-output .navbox-subgroup{background-color:#fdfdfd}.mw-parser-output .navbox-list{line-height:1.5em;border-color:#fdfdfd}.mw-parser-output .navbox-list-with-group{text-align:left;border-left-width:2px;border-left-style:solid}.mw-parser-output tr+tr>.navbox-abovebelow,.mw-parser-output tr+tr>.navbox-group,.mw-parser-output tr+tr>.navbox-image,.mw-parser-output tr+tr>.navbox-list{border-top:2px solid #fdfdfd}.mw-parser-output .navbox-title{background-color:#ccf}.mw-parser-output .navbox-abovebelow,.mw-parser-output .navbox-group,.mw-parser-output .navbox-subgroup 
.navbox-title{background-color:#ddf}.mw-parser-output .navbox-subgroup .navbox-group,.mw-parser-output .navbox-subgroup .navbox-abovebelow{background-color:#e6e6ff}.mw-parser-output .navbox-even{background-color:#f7f7f7}.mw-parser-output .navbox-odd{background-color:transparent}.mw-parser-output .navbox .hlist td dl,.mw-parser-output .navbox .hlist td ol,.mw-parser-output .navbox .hlist td ul,.mw-parser-output .navbox td.hlist dl,.mw-parser-output .navbox td.hlist ol,.mw-parser-output .navbox td.hlist ul{padding:0.125em 0}.mw-parser-output .navbox .navbar{display:block;font-size:100%}.mw-parser-output .navbox-title .navbar{float:left;text-align:left;margin-right:0.5em}body.skin--responsive .mw-parser-output .navbox-image img{max-width:none!important}@media print{body.ns-0 .mw-parser-output .navbox{display:none!important}}</style></div><div role="navigation" class="navbox" aria-labelledby="Parallel_computing" style="padding:3px"><table class="nowraplinks hlist mw-collapsible autocollapse navbox-inner" style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th scope="col" class="navbox-title" colspan="2"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><style data-mw-deduplicate="TemplateStyles:r1239400231">.mw-parser-output .navbar{display:inline;font-size:88%;font-weight:normal}.mw-parser-output .navbar-collapse{float:left;text-align:left}.mw-parser-output .navbar-boxtext{word-spacing:0}.mw-parser-output .navbar ul{display:inline-block;white-space:nowrap;line-height:inherit}.mw-parser-output .navbar-brackets::before{margin-right:-0.125em;content:"[ "}.mw-parser-output .navbar-brackets::after{margin-left:-0.125em;content:" ]"}.mw-parser-output .navbar li{word-spacing:-0.125em}.mw-parser-output .navbar a>span,.mw-parser-output .navbar a>abbr{text-decoration:inherit}.mw-parser-output .navbar-mini abbr{font-variant:small-caps;border-bottom:none;text-decoration:none;cursor:inherit}.mw-parser-output 
.navbar-ct-full{font-size:114%;margin:0 7em}.mw-parser-output .navbar-ct-mini{font-size:114%;margin:0 4em}html.skin-theme-clientpref-night .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}@media(prefers-color-scheme:dark){html.skin-theme-clientpref-os .mw-parser-output .navbar li a abbr{color:var(--color-base)!important}}@media print{.mw-parser-output .navbar{display:none!important}}</style><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Parallel_computing" title="Template:Parallel computing"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Parallel_computing" title="Template talk:Parallel computing"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Parallel_computing" title="Special:EditPage/Template:Parallel computing"><abbr title="Edit this template">e</abbr></a></li></ul></div><div id="Parallel_computing" style="font-size:114%;margin:0 4em"><a href="/wiki/Parallel_computing" title="Parallel computing">Parallel computing</a></div></th></tr><tr><th scope="row" class="navbox-group" style="width:1%">General</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Distributed_computing" title="Distributed computing">Distributed computing</a></li> <li><a href="/wiki/Parallel_computing" title="Parallel computing">Parallel computing</a></li> <li><a href="/wiki/Massively_parallel" title="Massively parallel">Massively parallel</a></li> <li><a href="/wiki/Cloud_computing" title="Cloud computing">Cloud computing</a></li> <li><a href="/wiki/High-performance_computing" title="High-performance computing">High-performance computing</a></li> <li><a href="/wiki/Multiprocessing" title="Multiprocessing">Multiprocessing</a></li> <li><a href="/wiki/Manycore_processor" title="Manycore processor">Manycore 
processor</a></li> <li><a href="/wiki/General-purpose_computing_on_graphics_processing_units" title="General-purpose computing on graphics processing units">GPGPU</a></li> <li><a href="/wiki/Computer_network" title="Computer network">Computer network</a></li> <li><a href="/wiki/Systolic_array" title="Systolic array">Systolic array</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Levels</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Bit-level_parallelism" title="Bit-level parallelism">Bit</a></li> <li><a href="/wiki/Instruction-level_parallelism" title="Instruction-level parallelism">Instruction</a></li> <li><a href="/wiki/Task_parallelism" title="Task parallelism">Thread</a></li> <li><a href="/wiki/Task_parallelism" title="Task parallelism">Task</a></li> <li><a href="/wiki/Data_parallelism" title="Data parallelism">Data</a></li> <li><a href="/wiki/Memory-level_parallelism" title="Memory-level parallelism">Memory</a></li> <li><a href="/wiki/Loop-level_parallelism" title="Loop-level parallelism">Loop</a></li> <li><a href="/wiki/Pipeline_(computing)" title="Pipeline (computing)">Pipeline</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Multithreading_(computer_architecture)" title="Multithreading (computer architecture)">Multithreading</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Temporal_multithreading" title="Temporal multithreading">Temporal</a></li> <li><a href="/wiki/Simultaneous_multithreading" title="Simultaneous multithreading">Simultaneous</a> (SMT)</li> <li><a href="/wiki/Simultaneous_and_heterogeneous_multithreading" title="Simultaneous and heterogeneous multithreading">Simultaneous and heterogeneous</a></li> <li><a href="/wiki/Speculative_multithreading" 
title="Speculative multithreading">Speculative</a> (SpMT)</li> <li><a href="/wiki/Preemption_(computing)" title="Preemption (computing)">Preemptive</a></li> <li><a href="/wiki/Computer_multitasking#Cooperative_multitasking" title="Computer multitasking">Cooperative</a></li> <li><a href="/wiki/Bulldozer_(microarchitecture)#Bulldozer_core" title="Bulldozer (microarchitecture)">Clustered multi-thread</a> (CMT)</li> <li><a href="/wiki/Hardware_scout" title="Hardware scout">Hardware scout</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Theory</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Parallel_RAM" title="Parallel RAM">PRAM model</a></li> <li><a href="/wiki/Parallel_external_memory" title="Parallel external memory">PEM model</a></li> <li><a href="/wiki/Analysis_of_parallel_algorithms" title="Analysis of parallel algorithms">Analysis of parallel algorithms</a></li> <li><a href="/wiki/Amdahl%27s_law" title="Amdahl's law">Amdahl's law</a></li> <li><a href="/wiki/Gustafson%27s_law" title="Gustafson's law">Gustafson's law</a></li> <li><a href="/wiki/Cost_efficiency" title="Cost efficiency">Cost efficiency</a></li> <li><a href="/wiki/Karp%E2%80%93Flatt_metric" title="Karp–Flatt metric">Karp–Flatt metric</a></li> <li><a href="/wiki/Parallel_slowdown" title="Parallel slowdown">Slowdown</a></li> <li><a href="/wiki/Speedup" title="Speedup">Speedup</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Elements</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Process_(computing)" title="Process (computing)">Process</a></li> <li><a href="/wiki/Thread_(computing)" title="Thread (computing)">Thread</a></li> <li><a href="/wiki/Fiber_(computer_science)" title="Fiber (computer science)">Fiber</a></li> <li><a 
href="/wiki/Instruction_window" title="Instruction window">Instruction window</a></li> <li><a href="/wiki/Array_(data_structure)" title="Array (data structure)">Array</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Coordination</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Multiprocessing" title="Multiprocessing">Multiprocessing</a></li> <li><a href="/wiki/Memory_coherence" title="Memory coherence">Memory coherence</a></li> <li><a href="/wiki/Cache_coherence" title="Cache coherence">Cache coherence</a></li> <li><a href="/wiki/Cache_invalidation" title="Cache invalidation">Cache invalidation</a></li> <li><a href="/wiki/Barrier_(computer_science)" title="Barrier (computer science)">Barrier</a></li> <li><a href="/wiki/Synchronization_(computer_science)" title="Synchronization (computer science)">Synchronization</a></li> <li><a href="/wiki/Application_checkpointing" title="Application checkpointing">Application checkpointing</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Computer_programming" title="Computer programming">Programming</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Stream_processing" title="Stream processing">Stream processing</a></li> <li><a href="/wiki/Dataflow_programming" title="Dataflow programming">Dataflow programming</a></li> <li><a href="/wiki/Parallel_programming_model" title="Parallel programming model">Models</a> <ul><li><a href="/wiki/Implicit_parallelism" title="Implicit parallelism">Implicit parallelism</a></li> <li><a href="/wiki/Explicit_parallelism" title="Explicit parallelism">Explicit parallelism</a></li> <li><a href="/wiki/Concurrency_(computer_science)" title="Concurrency (computer science)">Concurrency</a></li></ul></li> <li><a 
href="/wiki/Non-blocking_algorithm" title="Non-blocking algorithm">Non-blocking algorithm</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Computer_hardware" title="Computer hardware">Hardware</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Flynn%27s_taxonomy" title="Flynn's taxonomy">Flynn's taxonomy</a> <ul><li><a href="/wiki/Single_instruction,_single_data" title="Single instruction, single data">SISD</a></li> <li><a href="/wiki/Single_instruction,_multiple_data" title="Single instruction, multiple data">SIMD</a> <ul><li><a href="/wiki/Single_instruction,_multiple_threads" title="Single instruction, multiple threads">Array processing</a> (SIMT)</li> <li><a href="/wiki/Flynn%27s_taxonomy#Pipelined_processor" title="Flynn's taxonomy">Pipelined processing</a></li> <li><a href="/wiki/Flynn%27s_taxonomy#Associative_processor" title="Flynn's taxonomy">Associative processing</a></li></ul></li> <li><a href="/wiki/Multiple_instruction,_single_data" title="Multiple instruction, single data">MISD</a></li> <li><a href="/wiki/Multiple_instruction,_multiple_data" title="Multiple instruction, multiple data">MIMD</a></li></ul></li> <li><a href="/wiki/Dataflow_architecture" title="Dataflow architecture">Dataflow architecture</a></li> <li><a href="/wiki/Instruction_pipelining" title="Instruction pipelining">Pipelined processor</a></li> <li><a href="/wiki/Superscalar_processor" title="Superscalar processor">Superscalar processor</a></li> <li><a class="mw-selflink selflink">Vector processor</a></li> <li><a href="/wiki/Multiprocessing" title="Multiprocessing">Multiprocessor</a> <ul><li><a href="/wiki/Symmetric_multiprocessing" title="Symmetric multiprocessing">symmetric</a></li> <li><a href="/wiki/Asymmetric_multiprocessing" title="Asymmetric multiprocessing">asymmetric</a></li></ul></li> <li><a href="/wiki/Semiconductor_memory" 
title="Semiconductor memory">Memory</a> <ul><li><a href="/wiki/Shared_memory" title="Shared memory">shared</a></li> <li><a href="/wiki/Distributed_memory" title="Distributed memory">distributed</a></li> <li><a href="/wiki/Distributed_shared_memory" title="Distributed shared memory">distributed shared</a></li> <li><a href="/wiki/Uniform_memory_access" title="Uniform memory access">UMA</a></li> <li><a href="/wiki/Non-uniform_memory_access" title="Non-uniform memory access">NUMA</a></li> <li><a href="/wiki/Cache-only_memory_architecture" title="Cache-only memory architecture">COMA</a></li></ul></li> <li><a href="/wiki/Massively_parallel" title="Massively parallel">Massively parallel</a> computer</li> <li><a href="/wiki/Computer_cluster" title="Computer cluster">Computer cluster</a> <ul><li><a href="/wiki/Beowulf_cluster" title="Beowulf cluster">Beowulf cluster</a></li></ul></li> <li><a href="/wiki/Grid_computing" title="Grid computing">Grid computer</a></li> <li><a href="/wiki/Hardware_acceleration" title="Hardware acceleration">Hardware acceleration</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/API" title="API">APIs</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Ateji_PX" title="Ateji PX">Ateji PX</a></li> <li><a href="/wiki/Boost_(C%2B%2B_libraries)" title="Boost (C++ libraries)">Boost</a></li> <li><a href="/wiki/Chapel_(programming_language)" title="Chapel (programming language)">Chapel</a></li> <li><a href="/wiki/HPX" title="HPX">HPX</a></li> <li><a href="/wiki/Charm%2B%2B" title="Charm++">Charm++</a></li> <li><a href="/wiki/Cilk" title="Cilk">Cilk</a></li> <li><a href="/wiki/Coarray_Fortran" title="Coarray Fortran">Coarray Fortran</a></li> <li><a href="/wiki/CUDA" title="CUDA">CUDA</a></li> <li><a href="/wiki/Dryad_(programming)" title="Dryad (programming)">Dryad</a></li> <li><a href="/wiki/C%2B%2B_AMP" 
title="C++ AMP">C++ AMP</a></li> <li><a href="/wiki/Global_Arrays" title="Global Arrays">Global Arrays</a></li> <li><a href="/wiki/GPUOpen" title="GPUOpen">GPUOpen</a></li> <li><a href="/wiki/Message_Passing_Interface" title="Message Passing Interface">MPI</a></li> <li><a href="/wiki/OpenMP" title="OpenMP">OpenMP</a></li> <li><a href="/wiki/OpenCL" title="OpenCL">OpenCL</a></li> <li><a href="/wiki/OpenHMPP" title="OpenHMPP">OpenHMPP</a></li> <li><a href="/wiki/OpenACC" title="OpenACC">OpenACC</a></li> <li><a href="/wiki/Parallel_Extensions" title="Parallel Extensions">Parallel Extensions</a></li> <li><a href="/wiki/Parallel_Virtual_Machine" title="Parallel Virtual Machine">PVM</a></li> <li><a href="/wiki/Pthreads" title="Pthreads">pthreads</a></li> <li><a href="/wiki/RaftLib" title="RaftLib">RaftLib</a></li> <li><a href="/wiki/ROCm" title="ROCm">ROCm</a></li> <li><a href="/wiki/Unified_Parallel_C" title="Unified Parallel C">UPC</a></li> <li><a href="/wiki/Threading_Building_Blocks" title="Threading Building Blocks">TBB</a></li> <li><a href="/wiki/ZPL_(programming_language)" class="mw-redirect" title="ZPL (programming language)">ZPL</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Problems</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Automatic_parallelization" title="Automatic parallelization">Automatic parallelization</a></li> <li><a href="/wiki/Deadlock_(computer_science)" title="Deadlock (computer science)">Deadlock</a></li> <li><a href="/wiki/Deterministic_algorithm" title="Deterministic algorithm">Deterministic algorithm</a></li> <li><a href="/wiki/Embarrassingly_parallel" title="Embarrassingly parallel">Embarrassingly parallel</a></li> <li><a href="/wiki/Parallel_slowdown" title="Parallel slowdown">Parallel slowdown</a></li> <li><a href="/wiki/Race_condition" title="Race condition">Race condition</a></li> <li><a 
href="/wiki/Software_lockout" title="Software lockout">Software lockout</a></li> <li><a href="/wiki/Scalability" title="Scalability">Scalability</a></li> <li><a href="/wiki/Starvation_(computer_science)" title="Starvation (computer science)">Starvation</a></li></ul> </div></td></tr><tr><td class="navbox-abovebelow" colspan="2"><div> <ul><li><span class="noviewer" typeof="mw:File"><span title="Category"><img alt="" src="//upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/16px-Symbol_category_class.svg.png" decoding="async" width="16" height="16" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/23px-Symbol_category_class.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/96/Symbol_category_class.svg/31px-Symbol_category_class.svg.png 2x" data-file-width="180" data-file-height="185" /></span></span> <a href="/wiki/Category:Parallel_computing" title="Category:Parallel computing">Category: Parallel computing</a></li></ul> </div></td></tr></tbody></table></div> <div class="navbox-styles"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1236075235"></div><div role="navigation" class="navbox" aria-labelledby="Processor_technologies" style="padding:3px"><table class="nowraplinks mw-collapsible autocollapse navbox-inner" style="border-spacing:0;background:transparent;color:inherit"><tbody><tr><th scope="col" class="navbox-title" colspan="2"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1129693374"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1239400231"><div class="navbar plainlinks hlist navbar-mini"><ul><li class="nv-view"><a href="/wiki/Template:Processor_technologies" title="Template:Processor technologies"><abbr title="View this template">v</abbr></a></li><li class="nv-talk"><a href="/wiki/Template_talk:Processor_technologies" 
title="Template talk:Processor technologies"><abbr title="Discuss this template">t</abbr></a></li><li class="nv-edit"><a href="/wiki/Special:EditPage/Template:Processor_technologies" title="Special:EditPage/Template:Processor technologies"><abbr title="Edit this template">e</abbr></a></li></ul></div><div id="Processor_technologies" style="font-size:114%;margin:0 4em"><a href="/wiki/Processor_(computing)" title="Processor (computing)">Processor technologies</a></div></th></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Model_of_computation" title="Model of computation">Models</a></th><td class="navbox-list-with-group navbox-list navbox-odd hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Abstract_machine" title="Abstract machine">Abstract machine</a></li> <li><a href="/wiki/Stored-program_computer" title="Stored-program computer">Stored-program computer</a></li> <li><a href="/wiki/Finite-state_machine" title="Finite-state machine">Finite-state machine</a> <ul><li><a href="/wiki/Finite-state_machine_with_datapath" class="mw-redirect" title="Finite-state machine with datapath">with datapath</a></li> <li><a href="/wiki/Hierarchical_state_machine" class="mw-redirect" title="Hierarchical state machine">Hierarchical</a></li> <li><a href="/wiki/Deterministic_finite_automaton" title="Deterministic finite automaton">Deterministic finite automaton</a></li> <li><a href="/wiki/Queue_automaton" title="Queue automaton">Queue automaton</a></li> <li><a href="/wiki/Cellular_automaton" title="Cellular automaton">Cellular automaton</a></li> <li><a href="/wiki/Quantum_cellular_automaton" title="Quantum cellular automaton">Quantum cellular automaton</a></li></ul></li> <li><a href="/wiki/Turing_machine" title="Turing machine">Turing machine</a> <ul><li><a href="/wiki/Alternating_Turing_machine" title="Alternating Turing machine">Alternating Turing machine</a></li> <li><a href="/wiki/Universal_Turing_machine" 
title="Universal Turing machine">Universal</a></li> <li><a href="/wiki/Post%E2%80%93Turing_machine" title="Post–Turing machine">Post–Turing</a></li> <li><a href="/wiki/Quantum_Turing_machine" title="Quantum Turing machine">Quantum</a></li> <li><a href="/wiki/Nondeterministic_Turing_machine" title="Nondeterministic Turing machine">Nondeterministic Turing machine</a></li> <li><a href="/wiki/Probabilistic_Turing_machine" title="Probabilistic Turing machine">Probabilistic Turing machine</a></li> <li><a href="/wiki/Hypercomputation" title="Hypercomputation">Hypercomputation</a></li> <li><a href="/wiki/Zeno_machine" title="Zeno machine">Zeno machine</a></li></ul></li> <li><a href="/wiki/History_of_general-purpose_CPUs#Belt_machine_architecture" title="History of general-purpose CPUs">Belt machine</a></li> <li><a href="/wiki/Stack_machine" title="Stack machine">Stack machine</a></li> <li><a href="/wiki/Register_machine" title="Register machine">Register machines</a> <ul><li><a href="/wiki/Counter_machine" title="Counter machine">Counter</a></li> <li><a href="/wiki/Pointer_machine" title="Pointer machine">Pointer</a></li> <li><a href="/wiki/Random-access_machine" title="Random-access machine">Random-access</a></li> <li><a href="/wiki/Random-access_stored-program_machine" title="Random-access stored-program machine">Random-access stored program</a></li></ul></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Computer_architecture" title="Computer architecture">Architecture</a></th><td class="navbox-list-with-group navbox-list navbox-even hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Microarchitecture" title="Microarchitecture">Microarchitecture</a></li> <li><a href="/wiki/Von_Neumann_architecture" title="Von Neumann architecture">Von Neumann</a></li> <li><a href="/wiki/Harvard_architecture" title="Harvard architecture">Harvard</a> <ul><li><a href="/wiki/Modified_Harvard_architecture" 
title="Modified Harvard architecture">modified</a></li></ul></li> <li><a href="/wiki/Dataflow_architecture" title="Dataflow architecture">Dataflow</a></li> <li><a href="/wiki/Transport_triggered_architecture" title="Transport triggered architecture">Transport-triggered</a></li> <li><a href="/wiki/Cellular_architecture" title="Cellular architecture">Cellular</a></li> <li><a href="/wiki/Endianness" title="Endianness">Endianness</a></li> <li><a href="/wiki/Computer_data_storage" title="Computer data storage">Memory access</a> <ul><li><a href="/wiki/Non-uniform_memory_access" title="Non-uniform memory access">NUMA</a></li> <li><a href="/wiki/Uniform_memory_access" title="Uniform memory access">HUMA</a></li> <li><a href="/wiki/Load%E2%80%93store_architecture" title="Load–store architecture">Load–store</a></li> <li><a href="/wiki/Register%E2%80%93memory_architecture" title="Register–memory architecture">Register/memory</a></li></ul></li> <li><a href="/wiki/Cache_hierarchy" title="Cache hierarchy">Cache hierarchy</a></li> <li><a href="/wiki/Memory_hierarchy" title="Memory hierarchy">Memory hierarchy</a> <ul><li><a href="/wiki/Virtual_memory" title="Virtual memory">Virtual memory</a></li> <li><a href="/wiki/Secondary_storage" class="mw-redirect" title="Secondary storage">Secondary storage</a></li></ul></li> <li><a href="/wiki/Heterogeneous_System_Architecture" title="Heterogeneous System Architecture">Heterogeneous</a></li> <li><a href="/wiki/Fabric_computing" title="Fabric computing">Fabric</a></li> <li><a href="/wiki/Multiprocessing" title="Multiprocessing">Multiprocessing</a></li> <li><a href="/wiki/Cognitive_computing" title="Cognitive computing">Cognitive</a></li> <li><a href="/wiki/Neuromorphic_engineering" class="mw-redirect" title="Neuromorphic engineering">Neuromorphic</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Instruction_set_architecture" title="Instruction set architecture">Instruction set<br 
/>architectures</a></th><td class="navbox-list-with-group navbox-list navbox-odd hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"></div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%">Types</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Orthogonal_instruction_set" title="Orthogonal instruction set">Orthogonal instruction set</a></li> <li><a href="/wiki/Complex_instruction_set_computer" title="Complex instruction set computer">CISC</a></li> <li><a href="/wiki/Reduced_instruction_set_computer" title="Reduced instruction set computer">RISC</a></li> <li><a href="/wiki/Application-specific_instruction_set_processor" title="Application-specific instruction set processor">Application-specific</a></li> <li><a href="/wiki/Explicit_data_graph_execution" title="Explicit data graph execution">EDGE</a> <ul><li><a href="/wiki/TRIPS_architecture" title="TRIPS architecture">TRIPS</a></li></ul></li> <li><a href="/wiki/Very_long_instruction_word" title="Very long instruction word">VLIW</a> <ul><li><a href="/wiki/Explicitly_parallel_instruction_computing" title="Explicitly parallel instruction computing">EPIC</a></li></ul></li> <li><a href="/wiki/Minimal_instruction_set_computer" title="Minimal instruction set computer">MISC</a></li> <li><a href="/wiki/One-instruction_set_computer" title="One-instruction set computer">OISC</a></li> <li><a href="/wiki/No_instruction_set_computing" title="No instruction set computing">NISC</a></li> <li><a href="/wiki/Zero_instruction_set_computer" class="mw-redirect" title="Zero instruction set computer">ZISC</a></li> <li><a href="/wiki/VISC_architecture" title="VISC architecture">VISC architecture</a></li> <li><a href="/wiki/Quantum_computing" title="Quantum computing">Quantum computing</a></li> <li><a 
href="/wiki/Comparison_of_instruction_set_architectures" title="Comparison of instruction set architectures">Comparison</a> <ul><li><a href="/wiki/Addressing_mode" title="Addressing mode">Addressing modes</a></li></ul></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Instruction<br />sets</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Motorola_68000_series" title="Motorola 68000 series">Motorola 68000 series</a></li> <li><a href="/wiki/VAX" title="VAX">VAX</a></li> <li><a href="/wiki/PDP-11_architecture" title="PDP-11 architecture">PDP-11</a></li> <li><a href="/wiki/X86" title="X86">x86</a></li> <li><a href="/wiki/ARM_architecture_family" title="ARM architecture family">ARM</a></li> <li><a href="/wiki/Stanford_MIPS" title="Stanford MIPS">Stanford MIPS</a></li> <li><a href="/wiki/MIPS_architecture" title="MIPS architecture">MIPS</a></li> <li><a href="/wiki/MIPS-X" title="MIPS-X">MIPS-X</a></li> <li>Power <ul><li><a href="/wiki/IBM_POWER_architecture" title="IBM POWER architecture">POWER</a></li> <li><a href="/wiki/PowerPC" title="PowerPC">PowerPC</a></li> <li><a href="/wiki/Power_ISA" title="Power ISA">Power ISA</a></li></ul></li> <li><a href="/wiki/Clipper_architecture" title="Clipper architecture">Clipper architecture</a></li> <li><a href="/wiki/SPARC" title="SPARC">SPARC</a></li> <li><a href="/wiki/SuperH" title="SuperH">SuperH</a></li> <li><a href="/wiki/DEC_Alpha" title="DEC Alpha">DEC Alpha</a></li> <li><a href="/wiki/ETRAX_CRIS" title="ETRAX CRIS">ETRAX CRIS</a></li> <li><a href="/wiki/M32R" title="M32R">M32R</a></li> <li><a href="/wiki/Unicore" title="Unicore">Unicore</a></li> <li><a href="/wiki/IA-64" title="IA-64">Itanium</a></li> <li><a href="/wiki/OpenRISC" title="OpenRISC">OpenRISC</a></li> <li><a href="/wiki/RISC-V" title="RISC-V">RISC-V</a></li> <li><a href="/wiki/MicroBlaze" title="MicroBlaze">MicroBlaze</a></li> 
<li><a href="/wiki/Little_man_computer" title="Little man computer">LMC</a></li> <li>System/3x0 <ul><li><a href="/wiki/IBM_System/360_architecture" title="IBM System/360 architecture">S/360</a></li> <li><a href="/wiki/IBM_System/370" title="IBM System/370">S/370</a></li> <li><a href="/wiki/IBM_System/390" title="IBM System/390">S/390</a></li> <li><a href="/wiki/Z/Architecture" title="Z/Architecture">z/Architecture</a></li></ul></li> <li>Tilera ISA</li> <li><a href="/wiki/VISC_architecture" title="VISC architecture">VISC architecture</a></li> <li><a href="/wiki/Adapteva#Products" class="mw-redirect" title="Adapteva">Epiphany architecture</a></li> <li><a href="/wiki/Comparison_of_instruction_set_architectures" title="Comparison of instruction set architectures">Others</a></li></ul> </div></td></tr></tbody></table><div></div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Instruction_cycle" title="Instruction cycle">Execution</a></th><td class="navbox-list-with-group navbox-list navbox-odd hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"></div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Instruction_pipelining" title="Instruction pipelining">Instruction pipelining</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Pipeline_stall" title="Pipeline stall">Pipeline stall</a></li> <li><a href="/wiki/Operand_forwarding" title="Operand forwarding">Operand forwarding</a></li> <li><a href="/wiki/Classic_RISC_pipeline" title="Classic RISC pipeline">Classic RISC pipeline</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Hazard_(computer_architecture)" title="Hazard (computer architecture)">Hazards</a></th><td class="navbox-list-with-group navbox-list navbox-even" 
style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Data_dependency" title="Data dependency">Data dependency</a></li> <li><a href="/wiki/Structural_hazard" class="mw-redirect" title="Structural hazard">Structural</a></li> <li><a href="/wiki/Control_hazard" class="mw-redirect" title="Control hazard">Control</a></li> <li><a href="/wiki/False_sharing" title="False sharing">False sharing</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Out-of-order_execution" title="Out-of-order execution">Out-of-order</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Scoreboarding" title="Scoreboarding">Scoreboarding</a></li> <li><a href="/wiki/Tomasulo%27s_algorithm" title="Tomasulo's algorithm">Tomasulo's algorithm</a> <ul><li><a href="/wiki/Reservation_station" title="Reservation station">Reservation station</a></li> <li><a href="/wiki/Re-order_buffer" title="Re-order buffer">Re-order buffer</a></li></ul></li> <li><a href="/wiki/Register_renaming" title="Register renaming">Register renaming</a></li> <li><a href="/wiki/Wide-issue" title="Wide-issue">Wide-issue</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Speculative_execution" title="Speculative execution">Speculative</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Branch_predictor" title="Branch predictor">Branch prediction</a></li> <li><a href="/wiki/Memory_dependence_prediction" title="Memory dependence prediction">Memory dependence prediction</a></li></ul> </div></td></tr></tbody></table><div></div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Parallel_computing" title="Parallel computing">Parallelism</a></th><td class="navbox-list-with-group navbox-list 
navbox-odd hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"></div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%">Level</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Bit-level_parallelism" title="Bit-level parallelism">Bit</a> <ul><li><a href="/wiki/Bit-serial_architecture" title="Bit-serial architecture">Bit-serial</a></li> <li><a href="/wiki/Word_(computer_architecture)" title="Word (computer architecture)">Word</a></li></ul></li> <li><a href="/wiki/Instruction-level_parallelism" title="Instruction-level parallelism">Instruction</a></li> <li><a href="/wiki/Instruction_pipelining" title="Instruction pipelining">Pipelining</a> <ul><li><a href="/wiki/Scalar_processor" title="Scalar processor">Scalar</a></li> <li><a href="/wiki/Superscalar_processor" title="Superscalar processor">Superscalar</a></li></ul></li> <li><a href="/wiki/Task_parallelism" title="Task parallelism">Task</a> <ul><li><a href="/wiki/Thread_(computing)" title="Thread (computing)">Thread</a></li> <li><a href="/wiki/Process_(computing)" title="Process (computing)">Process</a></li></ul></li> <li><a href="/wiki/Data_parallelism" title="Data parallelism">Data</a> <ul><li><a class="mw-selflink selflink">Vector</a></li></ul></li> <li><a href="/wiki/Memory-level_parallelism" title="Memory-level parallelism">Memory</a></li> <li><a href="/wiki/Distributed_architecture" class="mw-redirect" title="Distributed architecture">Distributed</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Multithreading_(computer_architecture)" title="Multithreading (computer architecture)">Multithreading</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Temporal_multithreading" 
title="Temporal multithreading">Temporal</a></li> <li><a href="/wiki/Simultaneous_multithreading" title="Simultaneous multithreading">Simultaneous</a> <ul><li><a href="/wiki/Hyper-threading" title="Hyper-threading">Hyperthreading</a></li> <li><a href="/wiki/Simultaneous_and_heterogeneous_multithreading" title="Simultaneous and heterogeneous multithreading">Simultaneous and heterogeneous</a></li></ul></li> <li><a href="/wiki/Speculative_multithreading" title="Speculative multithreading">Speculative</a></li> <li><a href="/wiki/Preemption_(computing)" title="Preemption (computing)">Preemptive</a></li> <li><a href="/wiki/Cooperative_multitasking" title="Cooperative multitasking">Cooperative</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Flynn%27s_taxonomy" title="Flynn's taxonomy">Flynn's taxonomy</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Single_instruction,_single_data" title="Single instruction, single data">SISD</a></li> <li><a href="/wiki/Single_instruction,_multiple_data" title="Single instruction, multiple data">SIMD</a> <ul><li><a href="/wiki/Single_instruction,_multiple_threads" title="Single instruction, multiple threads">Array processing (SIMT)</a></li> <li><a href="/wiki/Flynn%27s_taxonomy#Pipelined_processor" title="Flynn's taxonomy">Pipelined processing</a></li> <li><a href="/wiki/Flynn%27s_taxonomy#Associative_processor" title="Flynn's taxonomy">Associative processing</a></li> <li><a href="/wiki/SWAR" title="SWAR">SWAR</a></li></ul></li> <li><a href="/wiki/Multiple_instruction,_single_data" title="Multiple instruction, single data">MISD</a></li> <li><a href="/wiki/Multiple_instruction,_multiple_data" title="Multiple instruction, multiple data">MIMD</a> <ul><li><a href="/wiki/Single_program,_multiple_data" title="Single program, multiple data">SPMD</a></li></ul></li></ul> 
</div></td></tr></tbody></table><div></div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Computer_performance" title="Computer performance">Processor<br />performance</a></th><td class="navbox-list-with-group navbox-list navbox-even hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Transistor_count" title="Transistor count">Transistor count</a></li> <li><a href="/wiki/Instructions_per_cycle" title="Instructions per cycle">Instructions per cycle</a> (IPC) <ul><li><a href="/wiki/Cycles_per_instruction" title="Cycles per instruction">Cycles per instruction</a> (CPI)</li></ul></li> <li><a href="/wiki/Instructions_per_second" title="Instructions per second">Instructions per second</a> (IPS)</li> <li><a href="/wiki/FLOPS" class="mw-redirect" title="FLOPS">Floating-point operations per second</a> (FLOPS)</li> <li><a href="/wiki/Transactions_per_second" title="Transactions per second">Transactions per second</a> (TPS)</li> <li><a href="/wiki/SUPS" title="SUPS">Synaptic updates per second</a> (SUPS)</li> <li><a href="/wiki/Performance_per_watt" title="Performance per watt">Performance per watt</a> (PPW)</li> <li><a href="/wiki/Cache_performance_measurement_and_metric" title="Cache performance measurement and metric">Cache performance metrics</a></li> <li><a href="/wiki/Computer_performance_by_orders_of_magnitude" title="Computer performance by orders of magnitude">Computer performance by orders of magnitude</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Processor_(computing)" title="Processor (computing)">Types</a></th><td class="navbox-list-with-group navbox-list navbox-odd hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Central_processing_unit" title="Central processing unit">Central processing unit</a> (CPU)</li> <li><a href="/wiki/Graphics_processing_unit" title="Graphics processing unit">Graphics 
processing unit</a> (GPU) <ul><li><a href="/wiki/General-purpose_computing_on_graphics_processing_units" title="General-purpose computing on graphics processing units">GPGPU</a></li></ul></li> <li><a class="mw-selflink selflink">Vector</a></li> <li><a href="/wiki/Barrel_processor" title="Barrel processor">Barrel</a></li> <li><a href="/wiki/Stream_processing" title="Stream processing">Stream</a></li> <li><a href="/wiki/Tile_processor" title="Tile processor">Tile processor</a></li> <li><a href="/wiki/Coprocessor" title="Coprocessor">Coprocessor</a></li> <li><a href="/wiki/Programmable_Array_Logic" title="Programmable Array Logic">PAL</a></li> <li><a href="/wiki/Application-specific_integrated_circuit" title="Application-specific integrated circuit">ASIC</a></li> <li><a href="/wiki/Field-programmable_gate_array" title="Field-programmable gate array">FPGA</a></li> <li><a href="/wiki/Field-programmable_object_array" title="Field-programmable object array">FPOA</a></li> <li><a href="/wiki/Complex_programmable_logic_device" title="Complex programmable logic device">CPLD</a></li> <li><a href="/wiki/Multi-chip_module" title="Multi-chip module">Multi-chip module</a> (MCM)</li> <li><a href="/wiki/System_in_a_package" title="System in a package">System in a package</a> (SiP)</li> <li><a href="/wiki/Package_on_a_package" title="Package on a package">Package on a package</a> (PoP)</li></ul> </div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%">By application</th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Embedded_system" title="Embedded system">Embedded system</a></li> <li><a href="/wiki/Microprocessor" title="Microprocessor">Microprocessor</a></li> <li><a href="/wiki/Microcontroller" title="Microcontroller">Microcontroller</a></li> <li><a href="/wiki/Mobile_processor" title="Mobile 
processor">Mobile</a></li> <li><a href="/wiki/Ultra-low-voltage_processor" title="Ultra-low-voltage processor">Ultra-low-voltage</a></li> <li><a href="/wiki/Application-specific_instruction_set_processor" title="Application-specific instruction set processor">ASIP</a></li> <li><a href="/wiki/Soft_microprocessor" title="Soft microprocessor">Soft microprocessor</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Systems<br />on chip</th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/System_on_a_chip" title="System on a chip">System on a chip</a> (SoC)</li> <li><a href="/wiki/Multiprocessor_system_on_a_chip" class="mw-redirect" title="Multiprocessor system on a chip">Multiprocessor</a> (MPSoC)</li> <li><a href="/wiki/Cypress_PSoC" title="Cypress PSoC">Cypress PSoC</a></li> <li><a href="/wiki/Network_on_a_chip" title="Network on a chip">Network on a chip</a> (NoC)</li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Hardware_acceleration" title="Hardware acceleration">Hardware<br />accelerators</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Coprocessor" title="Coprocessor">Coprocessor</a></li> <li><a href="/wiki/AI_accelerator" title="AI accelerator">AI accelerator</a></li> <li><a href="/wiki/Graphics_processing_unit" title="Graphics processing unit">Graphics processing unit</a> (GPU)</li> <li><a href="/wiki/Image_processor" title="Image processor">Image processor</a></li> <li><a href="/wiki/Vision_processing_unit" title="Vision processing unit">Vision processing unit</a> (VPU)</li> <li><a href="/wiki/Physics_processing_unit" title="Physics processing unit">Physics processing unit</a> (PPU)</li> <li><a href="/wiki/Digital_signal_processor" title="Digital signal processor">Digital signal 
processor</a> (DSP)</li> <li><a href="/wiki/Tensor_Processing_Unit" title="Tensor Processing Unit">Tensor Processing Unit</a> (TPU)</li> <li><a href="/wiki/Secure_cryptoprocessor" title="Secure cryptoprocessor">Secure cryptoprocessor</a></li> <li><a href="/wiki/Network_processor" title="Network processor">Network processor</a></li> <li><a href="/wiki/Baseband_processor" title="Baseband processor">Baseband processor</a></li></ul> </div></td></tr></tbody></table><div> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Word_(computer_architecture)" title="Word (computer architecture)">Word size</a></th><td class="navbox-list-with-group navbox-list navbox-odd hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/1-bit_computing" title="1-bit computing">1-bit</a></li> <li><a href="/wiki/4-bit_computing" title="4-bit computing">4-bit</a></li> <li><a href="/wiki/8-bit_computing" title="8-bit computing">8-bit</a></li> <li><a href="/wiki/12-bit_computing" title="12-bit computing">12-bit</a></li> <li><a href="/wiki/Apollo_Guidance_Computer" title="Apollo Guidance Computer">15-bit</a></li> <li><a href="/wiki/16-bit_computing" title="16-bit computing">16-bit</a></li> <li><a href="/wiki/24-bit_computing" title="24-bit computing">24-bit</a></li> <li><a href="/wiki/32-bit_computing" title="32-bit computing">32-bit</a></li> <li><a href="/wiki/48-bit_computing" title="48-bit computing">48-bit</a></li> <li><a href="/wiki/64-bit_computing" title="64-bit computing">64-bit</a></li> <li><a href="/wiki/128-bit_computing" title="128-bit computing">128-bit</a></li> <li><a href="/wiki/256-bit_computing" title="256-bit computing">256-bit</a></li> <li><a href="/wiki/512-bit_computing" title="512-bit computing">512-bit</a></li> <li><a href="/wiki/Bit_slicing" title="Bit slicing">bit slicing</a></li> <li><a href="/wiki/Word_(computer_architecture)#Table_of_word_sizes" title="Word (computer architecture)">others</a> 
<ul><li><a href="/wiki/Word_(computer_architecture)#Variable-word_architectures" title="Word (computer architecture)">variable</a></li></ul></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Core count</th><td class="navbox-list-with-group navbox-list navbox-even hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Single-core" title="Single-core">Single-core</a></li> <li><a href="/wiki/Multi-core_processor" title="Multi-core processor">Multi-core</a></li> <li><a href="/wiki/Manycore_processor" title="Manycore processor">Manycore</a></li> <li><a href="/wiki/Heterogeneous_computing" title="Heterogeneous computing">Heterogeneous architecture</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Components</th><td class="navbox-list-with-group navbox-list navbox-odd hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Central_processing_unit" title="Central processing unit">Core</a></li> <li><a href="/wiki/Cache_(computing)" title="Cache (computing)">Cache</a> <ul><li><a href="/wiki/CPU_cache" title="CPU cache">CPU cache</a></li> <li><a href="/wiki/Scratchpad_memory" title="Scratchpad memory">Scratchpad memory</a></li> <li><a href="/wiki/Data_cache" class="mw-redirect" title="Data cache">Data cache</a></li> <li><a href="/wiki/Instruction_cache" class="mw-redirect" title="Instruction cache">Instruction cache</a></li> <li><a href="/wiki/Cache_replacement_policies" title="Cache replacement policies">replacement policies</a></li> <li><a href="/wiki/Cache_coherence" title="Cache coherence">coherence</a></li></ul></li> <li><a href="/wiki/Bus_(computing)" title="Bus (computing)">Bus</a></li> <li><a href="/wiki/Clock_rate" title="Clock rate">Clock rate</a></li> <li><a href="/wiki/Clock_signal" title="Clock signal">Clock signal</a></li> <li><a href="/wiki/FIFO_(computing_and_electronics)" title="FIFO (computing and 
electronics)">FIFO</a></li></ul> </div><table class="nowraplinks navbox-subgroup" style="border-spacing:0"><tbody><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Execution_unit" title="Execution unit">Functional<br />units</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Arithmetic_logic_unit" title="Arithmetic logic unit">Arithmetic logic unit</a> (ALU)</li> <li><a href="/wiki/Address_generation_unit" title="Address generation unit">Address generation unit</a> (AGU)</li> <li><a href="/wiki/Floating-point_unit" title="Floating-point unit">Floating-point unit</a> (FPU)</li> <li><a href="/wiki/Memory_management_unit" title="Memory management unit">Memory management unit</a> (MMU) <ul><li><a href="/wiki/Load%E2%80%93store_unit" title="Load–store unit">Load–store unit</a></li> <li><a href="/wiki/Translation_lookaside_buffer" title="Translation lookaside buffer">Translation lookaside buffer</a> (TLB)</li></ul></li> <li><a href="/wiki/Branch_predictor" title="Branch predictor">Branch predictor</a></li> <li><a href="/wiki/Branch_target_predictor" title="Branch target predictor">Branch target predictor</a></li> <li><a href="/wiki/Memory_controller" title="Memory controller">Integrated memory controller</a> (IMC) <ul><li><a href="/wiki/Memory_management_unit" title="Memory management unit">Memory management unit</a></li></ul></li> <li><a href="/wiki/Instruction_decoder" class="mw-redirect" title="Instruction decoder">Instruction decoder</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Logic_gate" title="Logic gate">Logic</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Combinational_logic" title="Combinational logic">Combinational</a></li> <li><a href="/wiki/Sequential_logic" title="Sequential 
logic">Sequential</a></li> <li><a href="/wiki/Glue_logic" title="Glue logic">Glue</a></li> <li><a href="/wiki/Logic_gate" title="Logic gate">Logic gate</a> <ul><li><a href="/wiki/Quantum_logic_gate" title="Quantum logic gate">Quantum</a></li> <li><a href="/wiki/Gate_array" title="Gate array">Array</a></li></ul></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Hardware_register" title="Hardware register">Registers</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Processor_register" title="Processor register">Processor register</a></li> <li><a href="/wiki/Status_register" title="Status register">Status register</a></li> <li><a href="/wiki/Stack_register" title="Stack register">Stack register</a></li> <li><a href="/wiki/Register_file" title="Register file">Register file</a></li> <li><a href="/wiki/Memory_buffer_register" title="Memory buffer register">Memory buffer</a></li> <li><a href="/wiki/Memory_address_register" title="Memory address register">Memory address register</a></li> <li><a href="/wiki/Program_counter" title="Program counter">Program counter</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Control_unit" title="Control unit">Control unit</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Hardwired_control_unit" class="mw-redirect" title="Hardwired control unit">Hardwired control unit</a></li> <li><a href="/wiki/Instruction_unit" title="Instruction unit">Instruction unit</a></li> <li><a href="/wiki/Data_buffer" title="Data buffer">Data buffer</a></li> <li><a href="/wiki/Write_buffer" title="Write buffer">Write buffer</a></li> <li><a href="/wiki/Microcode" title="Microcode">Microcode</a> <a href="/wiki/ROM_image" title="ROM image">ROM</a></li> <li><a 
href="/wiki/Counter_(digital)" title="Counter (digital)">Counter</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Datapath" title="Datapath">Datapath</a></th><td class="navbox-list-with-group navbox-list navbox-even" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Multiplexer" title="Multiplexer">Multiplexer</a></li> <li><a href="/wiki/Demultiplexer" class="mw-redirect" title="Demultiplexer">Demultiplexer</a></li> <li><a href="/wiki/Adder_(electronics)" title="Adder (electronics)">Adder</a></li> <li><a href="/wiki/Binary_multiplier" title="Binary multiplier">Multiplier</a> <ul><li><a href="/wiki/CPU_multiplier" title="CPU multiplier">CPU</a></li></ul></li> <li><a href="/wiki/Binary_decoder" title="Binary decoder">Binary decoder</a> <ul><li><a href="/wiki/Address_decoder" title="Address decoder">Address decoder</a></li> <li><a href="/wiki/Sum-addressed_decoder" title="Sum-addressed decoder">Sum-addressed decoder</a></li></ul></li> <li><a href="/wiki/Barrel_shifter" title="Barrel shifter">Barrel shifter</a></li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Electronic_circuit" title="Electronic circuit">Circuitry</a></th><td class="navbox-list-with-group navbox-list navbox-odd" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Integrated_circuit" title="Integrated circuit">Integrated circuit</a> <ul><li><a href="/wiki/Three-dimensional_integrated_circuit" title="Three-dimensional integrated circuit">3D</a></li> <li><a href="/wiki/Mixed-signal_integrated_circuit" title="Mixed-signal integrated circuit">Mixed-signal</a></li> <li><a href="/wiki/Power_management_integrated_circuit" title="Power management integrated circuit">Power management</a></li></ul></li> <li><a href="/wiki/Boolean_circuit" title="Boolean circuit">Boolean</a></li> <li><a href="/wiki/Circuit_(computer_science)" title="Circuit 
(computer science)">Digital</a></li> <li><a href="/wiki/Analogue_electronics" title="Analogue electronics">Analog</a></li> <li><a href="/wiki/Quantum_circuit" title="Quantum circuit">Quantum</a></li> <li><a href="/wiki/Switch#Electronic_switches" title="Switch">Switch</a></li></ul> </div></td></tr></tbody></table><div> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%"><a href="/wiki/Power_management" title="Power management">Power<br />management</a></th><td class="navbox-list-with-group navbox-list navbox-even hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/Power_Management_Unit" title="Power Management Unit">PMU</a></li> <li><a href="/wiki/Advanced_Power_Management" title="Advanced Power Management">APM</a></li> <li><a href="/wiki/ACPI" title="ACPI">ACPI</a></li> <li><a href="/wiki/Dynamic_frequency_scaling" title="Dynamic frequency scaling">Dynamic frequency scaling</a></li> <li><a href="/wiki/Dynamic_voltage_scaling" title="Dynamic voltage scaling">Dynamic voltage scaling</a></li> <li><a href="/wiki/Clock_gating" title="Clock gating">Clock gating</a></li> <li><a href="/wiki/Performance_per_watt" title="Performance per watt">Performance per watt</a> (PPW)</li></ul> </div></td></tr><tr><th scope="row" class="navbox-group" style="width:1%">Related</th><td class="navbox-list-with-group navbox-list navbox-odd hlist" style="width:100%;padding:0"><div style="padding:0 0.25em"> <ul><li><a href="/wiki/History_of_general-purpose_CPUs" title="History of general-purpose CPUs">History of general-purpose CPUs</a></li> <li><a href="/wiki/Microprocessor_chronology" title="Microprocessor chronology">Microprocessor chronology</a></li> <li><a href="/wiki/Processor_design" title="Processor design">Processor design</a></li> <li><a href="/wiki/Digital_electronics" title="Digital electronics">Digital electronics</a></li> <li><a href="/wiki/Hardware_security_module" title="Hardware security module">Hardware security 
module</a></li> <li><a href="/wiki/Semiconductor_device_fabrication" title="Semiconductor device fabrication">Semiconductor device fabrication</a></li> <li><a href="/wiki/Tick%E2%80%93tock_model" title="Tick–tock model">Tick–tock model</a></li> <li><a href="/wiki/Pin_grid_array" title="Pin grid array">Pin grid array</a></li> <li><a href="/wiki/Chip_carrier" title="Chip carrier">Chip carrier</a></li></ul> </div></td></tr></tbody></table></div> <!-- NewPP limit report Parsed by mw‐web.codfw.main‐f69cdc8f6‐w24zn Cached time: 20241122142156 Cache expiry: 2592000 Reduced expiry: false Complications: [vary‐revision‐sha1, show‐toc] CPU time usage: 0.785 seconds Real time usage: 1.260 seconds Preprocessor visited node count: 4135/1000000 Post‐expand include size: 153698/2097152 bytes Template argument size: 5524/2097152 bytes Highest expansion depth: 15/100 Expensive parser function count: 44/500 Unstrip recursion depth: 1/20 Unstrip post‐expand size: 122623/5000000 bytes Lua time usage: 0.407/10.000 seconds Lua memory usage: 6902666/52428800 bytes Number of Wikibase entities loaded: 0/400 --> <!-- Transclusion expansion time report (%,ms,calls,template) 100.00% 1006.389 1 -total 22.15% 222.890 1 Template:Reflist 16.04% 161.405 15 Template:Code 13.16% 132.466 7 Template:Navbox 10.40% 104.640 1 Template:Short_description 10.34% 104.082 1 Template:Parallel_computing 10.01% 100.707 1 Template:Cite_magazine 8.43% 84.854 7 Template:Fix 5.51% 55.425 2 Template:Pagetype 5.16% 51.948 2 Template:Cn --> <!-- Saved in parser cache with key enwiki:pcache:idhash:58205-0!canonical and timestamp 20241122142156 and revision id 1255528928. 
Rendering was triggered because: page-view --> </div><!--esi <esi:include src="/esitest-fa8a495983347898/content" /> --><noscript><img src="https://login.wikimedia.org/wiki/Special:CentralAutoLogin/start?type=1x1" alt="" width="1" height="1" style="border: none; position: absolute;"></noscript>
<!-- "Retrieved from" line: links to the exact revision (oldid) this page was rendered from. The ampersand in the query string is written as a character reference in both the href and the visible text. -->
<div class="printfooter" data-nosnippet="">Retrieved from "<a dir="ltr" href="https://en.wikipedia.org/w/index.php?title=Vector_processor&amp;oldid=1255528928">https://en.wikipedia.org/w/index.php?title=Vector_processor&amp;oldid=1255528928</a>"</div></div>
<!-- Category links: the normal categories list, followed by the hidden-categories list. List items are kept adjacent (no whitespace between them), exactly as generated. -->
<div id="catlinks" class="catlinks" data-mw="interface"><div id="mw-normal-catlinks" class="mw-normal-catlinks"><a href="/wiki/Help:Category" title="Help:Category">Categories</a>:
<ul><li><a href="/wiki/Category:Central_processing_unit" title="Category:Central processing unit">Central processing unit</a></li><li><a href="/wiki/Category:Coprocessors" title="Category:Coprocessors">Coprocessors</a></li><li><a href="/wiki/Category:Parallel_computing" title="Category:Parallel computing">Parallel computing</a></li><li><a href="/wiki/Category:Vector_supercomputers" title="Category:Vector supercomputers">Vector supercomputers</a></li></ul></div><div id="mw-hidden-catlinks" class="mw-hidden-catlinks mw-hidden-cats-hidden">Hidden categories:
<ul><li><a href="/wiki/Category:Articles_with_short_description" title="Category:Articles with short description">Articles with short description</a></li><li><a href="/wiki/Category:Short_description_matches_Wikidata" title="Category:Short description matches Wikidata">Short description matches Wikidata</a></li><li><a href="/wiki/Category:All_articles_with_unsourced_statements" title="Category:All articles with unsourced statements">All articles with unsourced statements</a></li><li><a href="/wiki/Category:Articles_with_unsourced_statements_from_July_2023" title="Category:Articles with unsourced statements from July 2023">Articles with unsourced statements from July 2023</a></li><li><a href="/wiki/Category:Articles_needing_additional_references_from_July_2023" title="Category:Articles needing additional references from July 2023">Articles needing additional references from July 2023</a></li><li><a href="/wiki/Category:All_articles_needing_additional_references" title="Category:All articles needing additional references">All articles needing additional references</a></li><li><a href="/wiki/Category:Articles_containing_potentially_dated_statements_from_2016" title="Category:Articles containing potentially dated statements from 2016">Articles containing potentially dated statements from 2016</a></li><li><a href="/wiki/Category:All_articles_containing_potentially_dated_statements" title="Category:All articles containing potentially dated statements">All articles containing potentially dated statements</a></li><li><a href="/wiki/Category:Articles_with_unsourced_statements_from_June_2021" title="Category:Articles with unsourced statements from June 2021">Articles with unsourced statements from June 2021</a></li><li><a href="/wiki/Category:Wikipedia_articles_needing_clarification_from_December_2023" title="Category:Wikipedia articles needing clarification from December 2023">Wikipedia articles needing clarification from December 2023</a></li><li><a href="/wiki/Category:Articles_with_specifically_marked_weasel-worded_phrases_from_November_2021" title="Category:Articles with specifically marked weasel-worded phrases from November 2021">Articles with specifically marked weasel-worded phrases from November 2021</a></li><li><a href="/wiki/Category:Wikipedia_articles_with_style_issues_from_November_2021" title="Category:Wikipedia articles with style issues from November 2021">Wikipedia articles with style issues from November 2021</a></li><li><a href="/wiki/Category:All_articles_with_style_issues" title="Category:All articles with style issues">All articles with style issues</a></li></ul></div></div>
</div>
</main>
</div>
<div class="mw-footer-container">
<!-- Site footer: last-edit and licence notice, policy/about links, and attribution badges. All absolute links use an explicit https: scheme. -->
<footer id="footer" class="mw-footer">
  <ul id="footer-info">
    <li id="footer-info-lastmod"> This page was last edited on 5 November 2024, at 11:28<span class="anonymous-show"> (UTC)</span>.</li>
    <li id="footer-info-copyright">Text is available under the <a href="/wiki/Wikipedia:Text_of_the_Creative_Commons_Attribution-ShareAlike_4.0_International_License" title="Wikipedia:Text of the Creative Commons Attribution-ShareAlike 4.0 International License">Creative Commons Attribution-ShareAlike 4.0 License</a>; additional terms may apply. By using this site, you agree to the <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Terms_of_Use" class="extiw" title="foundation:Special:MyLanguage/Policy:Terms of Use">Terms of Use</a> and <a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy" class="extiw" title="foundation:Special:MyLanguage/Policy:Privacy policy">Privacy Policy</a>. Wikipedia® is a registered trademark of the <a rel="nofollow" class="external text" href="https://wikimediafoundation.org/">Wikimedia Foundation, Inc.</a>, a non-profit organization.</li>
  </ul>
  <ul id="footer-places">
    <li id="footer-places-privacy"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy">Privacy policy</a></li>
    <li id="footer-places-about"><a href="/wiki/Wikipedia:About">About Wikipedia</a></li>
    <li id="footer-places-disclaimers"><a href="/wiki/Wikipedia:General_disclaimer">Disclaimers</a></li>
    <li id="footer-places-contact"><a href="https://en.wikipedia.org/wiki/Wikipedia:Contact_us">Contact Wikipedia</a></li>
    <li id="footer-places-wm-codeofconduct"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Universal_Code_of_Conduct">Code of Conduct</a></li>
    <li id="footer-places-developers"><a href="https://developer.wikimedia.org">Developers</a></li>
    <li id="footer-places-statslink"><a href="https://stats.wikimedia.org/#/en.wikipedia.org">Statistics</a></li>
    <li id="footer-places-cookiestatement"><a href="https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Cookie_statement">Cookie statement</a></li>
    <li id="footer-places-mobileview"><a href="https://en.m.wikipedia.org/w/index.php?title=Vector_processor&amp;mobileaction=toggle_view_mobile" class="noprint stopMobileRedirectToggle">Mobile view</a></li>
  </ul>
  <ul id="footer-icons" class="noprint">
    <li id="footer-copyrightico"><a href="https://wikimediafoundation.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/static/images/footer/wikimedia-button.svg" width="84" height="29" alt="Wikimedia Foundation" loading="lazy"></a></li>
    <li id="footer-poweredbyico"><a href="https://www.mediawiki.org/" class="cdx-button cdx-button--fake-button cdx-button--size-large cdx-button--fake-button--enabled"><img src="/w/resources/assets/poweredby_mediawiki.svg" alt="Powered by MediaWiki" width="88" height="31" loading="lazy"></a></li>
  </ul>
</footer>
<!-- The three closing tags below end the footer container and two wrapper elements that were opened earlier in the document. -->
</div>
</div>
</div>
<!-- Empty bottom dock list, then an inline script that queues a callback on RLQ to store the page-parse and cache report via mw.config.set. Every quoted string in the script must stay on one line: a raw line break inside a JavaScript string literal is a syntax error that prevents the whole script from running. -->
<div class="vector-settings" id="p-dock-bottom"> <ul></ul> </div><script>(RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgHostname":"mw-web.codfw.main-f69cdc8f6-bklvv","wgBackendResponseTime":227,"wgPageParseReport":{"limitreport":{"cputime":"0.785","walltime":"1.260","ppvisitednodes":{"value":4135,"limit":1000000},"postexpandincludesize":{"value":153698,"limit":2097152},"templateargumentsize":{"value":5524,"limit":2097152},"expansiondepth":{"value":15,"limit":100},"expensivefunctioncount":{"value":44,"limit":500},"unstrip-depth":{"value":1,"limit":20},"unstrip-size":{"value":122623,"limit":5000000},"entityaccesscount":{"value":0,"limit":400},"timingprofile":["100.00% 1006.389 1 -total"," 22.15% 222.890 1 Template:Reflist"," 16.04% 161.405 15 Template:Code"," 13.16% 132.466 7 Template:Navbox"," 10.40% 104.640 1 Template:Short_description"," 10.34% 104.082 1 Template:Parallel_computing"," 10.01% 100.707 1 Template:Cite_magazine"," 8.43% 84.854 7 Template:Fix"," 5.51% 55.425 2 Template:Pagetype"," 5.16% 51.948 2 Template:Cn"]},"scribunto":{"limitreport-timeusage":{"value":"0.407","limit":"10.000"},"limitreport-memusage":{"value":6902666,"limit":52428800}},"cachereport":{"origin":"mw-web.codfw.main-f69cdc8f6-w24zn","timestamp":"20241122142156","ttl":2592000,"transientcontent":false}}});});</script>
<!-- schema.org Article metadata as JSON-LD (a data block, not executed as JavaScript). -->
<script type="application/ld+json">{"@context":"https:\/\/schema.org","@type":"Article","name":"Vector processor","url":"https:\/\/en.wikipedia.org\/wiki\/Vector_processor","sameAs":"http:\/\/www.wikidata.org\/entity\/Q919509","mainEntity":"http:\/\/www.wikidata.org\/entity\/Q919509","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\/\/www.wikimedia.org\/static\/images\/wmf-hor-googpub.png"}},"datePublished":"2002-06-19T15:16:34Z","dateModified":"2024-11-05T11:28:35Z","headline":"computer processor which works on arrays of several numbers at once"}</script>
</body>
</html>