CINXE.COM
Effective Troubleshooting – Engineering Blog
<!--
  Overview of the inline scripts that open this head section.

  1. Two cookie-consent scripts are loaded from cdn.cookielaw.org (OtAutoBlock.js and otSDKStub.js),
     followed by a global OptanonWrapper() function whose body is empty.
     NOTE(review): presumably a callback hook expected by the consent SDK; confirm before removing.

  2. window._wpemojiSettings plus the minified, auto-generated WordPress emoji-support loader.
     * It draws pairs of emoji sequences on a 2D canvas (an OffscreenCanvas inside a Worker when Worker,
       OffscreenCanvas, URL.createObjectURL and Blob are all available, otherwise a canvas element) and
       compares the pixel data to decide whether the "flag" and "emoji" tests pass.
     * Results are stored in sessionStorage under the key "wpEmojiSettingsSupports" together with a timestamp,
       and are merged into _wpemojiSettings.supports (everything / everythingExceptFlag).
     * After DOMContentLoaded, when supports.everything is false, it calls readyCallback() and appends a deferred
       script element for source.concatemoji, or for source.twemoji and source.wpemoji when both are set.
     * The whole loader is skipped when Promise is undefined.
     NOTE(review): the cache freshness test compares (new Date).valueOf(), which is in milliseconds, against
     timestamp+604800; 604800 is the number of seconds in a week, so the cached result appears to expire
     after roughly ten minutes. Confirm against upstream WordPress before relying on the cache lifetime.
-->
<!doctype html> <html lang="en-US"> <head> <script type="text/javascript" src="https://cdn.cookielaw.org/consent/3959dac5-afc8-4ac6-a2c2-f91a2836f1c6/OtAutoBlock.js" ></script> <script src="https://cdn.cookielaw.org/scripttemplates/otSDKStub.js" type="text/javascript" charset="UTF-8" data-domain-script="3959dac5-afc8-4ac6-a2c2-f91a2836f1c6" ></script> <script type="text/javascript"> function OptanonWrapper() { } </script> <meta charset="UTF-8"> <link rel="profile" href="https://gmpg.org/xfn/11"> <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.6.0/jquery.min.js"></script> <title>Effective Troubleshooting – Engineering Blog</title> <meta name='robots' content='max-image-preview:large' /> <link rel="alternate" type="application/rss+xml" title="Engineering Blog » Feed" href="https://engineering.zeta.tech/feed/" /> <link rel="alternate" type="application/rss+xml" title="Engineering Blog » Comments Feed" href="https://engineering.zeta.tech/comments/feed/" /> <link rel="alternate" type="application/rss+xml" title="Engineering Blog » Effective Troubleshooting Comments Feed" href="https://engineering.zeta.tech/effective-troubleshooting/feed/" /> <script> window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/15.0.3\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/15.0.3\/svg\/","svgExt":".svg","source":{"concatemoji":"https:\/\/engineering.zeta.tech\/wp-includes\/js\/wp-emoji-release.min.js?ver=6.6.1"}}; /*! 
This file is auto-generated */ !function(i,n){var o,s,e;function c(e){try{var t={supportTests:e,timestamp:(new Date).valueOf()};sessionStorage.setItem(o,JSON.stringify(t))}catch(e){}}function p(e,t,n){e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(t,0,0);var t=new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data),r=(e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(n,0,0),new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data));return t.every(function(e,t){return e===r[t]})}function u(e,t,n){switch(t){case"flag":return n(e,"\ud83c\udff3\ufe0f\u200d\u26a7\ufe0f","\ud83c\udff3\ufe0f\u200b\u26a7\ufe0f")?!1:!n(e,"\ud83c\uddfa\ud83c\uddf3","\ud83c\uddfa\u200b\ud83c\uddf3")&&!n(e,"\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f","\ud83c\udff4\u200b\udb40\udc67\u200b\udb40\udc62\u200b\udb40\udc65\u200b\udb40\udc6e\u200b\udb40\udc67\u200b\udb40\udc7f");case"emoji":return!n(e,"\ud83d\udc26\u200d\u2b1b","\ud83d\udc26\u200b\u2b1b")}return!1}function f(e,t,n){var r="undefined"!=typeof WorkerGlobalScope&&self instanceof WorkerGlobalScope?new OffscreenCanvas(300,150):i.createElement("canvas"),a=r.getContext("2d",{willReadFrequently:!0}),o=(a.textBaseline="top",a.font="600 32px Arial",{});return e.forEach(function(e){o[e]=t(a,e,n)}),o}function t(e){var t=i.createElement("script");t.src=e,t.defer=!0,i.head.appendChild(t)}"undefined"!=typeof Promise&&(o="wpEmojiSettingsSupports",s=["flag","emoji"],n.supports={everything:!0,everythingExceptFlag:!0},e=new Promise(function(e){i.addEventListener("DOMContentLoaded",e,{once:!0})}),new Promise(function(t){var n=function(){try{var e=JSON.parse(sessionStorage.getItem(o));if("object"==typeof e&&"number"==typeof e.timestamp&&(new Date).valueOf()<e.timestamp+604800&&"object"==typeof e.supportTests)return e.supportTests}catch(e){}return null}();if(!n){if("undefined"!=typeof Worker&&"undefined"!=typeof OffscreenCanvas&&"undefined"!=typeof 
URL&&URL.createObjectURL&&"undefined"!=typeof Blob)try{var e="postMessage("+f.toString()+"("+[JSON.stringify(s),u.toString(),p.toString()].join(",")+"));",r=new Blob([e],{type:"text/javascript"}),a=new Worker(URL.createObjectURL(r),{name:"wpTestEmojiSupports"});return void(a.onmessage=function(e){c(n=e.data),a.terminate(),t(n)})}catch(e){}c(n=f(s,u,p))}t(n)}).then(function(e){for(var t in e)n.supports[t]=e[t],n.supports.everything=n.supports.everything&&n.supports[t],"flag"!==t&&(n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&n.supports[t]);n.supports.everythingExceptFlag=n.supports.everythingExceptFlag&&!n.supports.flag,n.DOMReady=!1,n.readyCallback=function(){n.DOMReady=!0}}).then(function(){return e}).then(function(){var e;n.supports.everything||(n.readyCallback(),(e=n.source||{}).concatemoji?t(e.concatemoji):e.wpemoji&&e.twemoji&&(t(e.twemoji),t(e.wpemoji)))}))}((window,document),window._wpemojiSettings); </script> <style id='wp-emoji-styles-inline-css'> img.wp-smiley, img.emoji { display: inline !important; border: none !important; box-shadow: none !important; height: 1em !important; width: 1em !important; margin: 0 0.07em !important; vertical-align: -0.1em !important; background: none !important; padding: 0 !important; } </style> <link rel='stylesheet' id='wp-block-library-css' href='https://engineering.zeta.tech/wp-includes/css/dist/block-library/style.min.css?ver=6.6.1' media='all' /> <style id='classic-theme-styles-inline-css'> /*! 
This file is auto-generated */ .wp-block-button__link{color:#fff;background-color:#32373c;border-radius:9999px;box-shadow:none;text-decoration:none;padding:calc(.667em + 2px) calc(1.333em + 2px);font-size:1.125em}.wp-block-file__button{background:#32373c;color:#fff;text-decoration:none} </style> <style id='global-styles-inline-css'> :root{--wp--preset--aspect-ratio--square: 1;--wp--preset--aspect-ratio--4-3: 4/3;--wp--preset--aspect-ratio--3-4: 3/4;--wp--preset--aspect-ratio--3-2: 3/2;--wp--preset--aspect-ratio--2-3: 2/3;--wp--preset--aspect-ratio--16-9: 16/9;--wp--preset--aspect-ratio--9-16: 9/16;--wp--preset--color--black: #000000;--wp--preset--color--cyan-bluish-gray: #abb8c3;--wp--preset--color--white: #ffffff;--wp--preset--color--pale-pink: #f78da7;--wp--preset--color--vivid-red: #cf2e2e;--wp--preset--color--luminous-vivid-orange: #ff6900;--wp--preset--color--luminous-vivid-amber: #fcb900;--wp--preset--color--light-green-cyan: #7bdcb5;--wp--preset--color--vivid-green-cyan: #00d084;--wp--preset--color--pale-cyan-blue: #8ed1fc;--wp--preset--color--vivid-cyan-blue: #0693e3;--wp--preset--color--vivid-purple: #9b51e0;--wp--preset--gradient--vivid-cyan-blue-to-vivid-purple: linear-gradient(135deg,rgba(6,147,227,1) 0%,rgb(155,81,224) 100%);--wp--preset--gradient--light-green-cyan-to-vivid-green-cyan: linear-gradient(135deg,rgb(122,220,180) 0%,rgb(0,208,130) 100%);--wp--preset--gradient--luminous-vivid-amber-to-luminous-vivid-orange: linear-gradient(135deg,rgba(252,185,0,1) 0%,rgba(255,105,0,1) 100%);--wp--preset--gradient--luminous-vivid-orange-to-vivid-red: linear-gradient(135deg,rgba(255,105,0,1) 0%,rgb(207,46,46) 100%);--wp--preset--gradient--very-light-gray-to-cyan-bluish-gray: linear-gradient(135deg,rgb(238,238,238) 0%,rgb(169,184,195) 100%);--wp--preset--gradient--cool-to-warm-spectrum: linear-gradient(135deg,rgb(74,234,220) 0%,rgb(151,120,209) 20%,rgb(207,42,186) 40%,rgb(238,44,130) 60%,rgb(251,105,98) 80%,rgb(254,248,76) 
100%);--wp--preset--gradient--blush-light-purple: linear-gradient(135deg,rgb(255,206,236) 0%,rgb(152,150,240) 100%);--wp--preset--gradient--blush-bordeaux: linear-gradient(135deg,rgb(254,205,165) 0%,rgb(254,45,45) 50%,rgb(107,0,62) 100%);--wp--preset--gradient--luminous-dusk: linear-gradient(135deg,rgb(255,203,112) 0%,rgb(199,81,192) 50%,rgb(65,88,208) 100%);--wp--preset--gradient--pale-ocean: linear-gradient(135deg,rgb(255,245,203) 0%,rgb(182,227,212) 50%,rgb(51,167,181) 100%);--wp--preset--gradient--electric-grass: linear-gradient(135deg,rgb(202,248,128) 0%,rgb(113,206,126) 100%);--wp--preset--gradient--midnight: linear-gradient(135deg,rgb(2,3,129) 0%,rgb(40,116,252) 100%);--wp--preset--font-size--small: 13px;--wp--preset--font-size--medium: 20px;--wp--preset--font-size--large: 36px;--wp--preset--font-size--x-large: 42px;--wp--preset--spacing--20: 0.44rem;--wp--preset--spacing--30: 0.67rem;--wp--preset--spacing--40: 1rem;--wp--preset--spacing--50: 1.5rem;--wp--preset--spacing--60: 2.25rem;--wp--preset--spacing--70: 3.38rem;--wp--preset--spacing--80: 5.06rem;--wp--preset--shadow--natural: 6px 6px 9px rgba(0, 0, 0, 0.2);--wp--preset--shadow--deep: 12px 12px 50px rgba(0, 0, 0, 0.4);--wp--preset--shadow--sharp: 6px 6px 0px rgba(0, 0, 0, 0.2);--wp--preset--shadow--outlined: 6px 6px 0px -3px rgba(255, 255, 255, 1), 6px 6px rgba(0, 0, 0, 1);--wp--preset--shadow--crisp: 6px 6px 0px rgba(0, 0, 0, 1);}:where(.is-layout-flex){gap: 0.5em;}:where(.is-layout-grid){gap: 0.5em;}body .is-layout-flex{display: flex;}.is-layout-flex{flex-wrap: wrap;align-items: center;}.is-layout-flex > :is(*, div){margin: 0;}body .is-layout-grid{display: grid;}.is-layout-grid > :is(*, div){margin: 0;}:where(.wp-block-columns.is-layout-flex){gap: 2em;}:where(.wp-block-columns.is-layout-grid){gap: 2em;}:where(.wp-block-post-template.is-layout-flex){gap: 1.25em;}:where(.wp-block-post-template.is-layout-grid){gap: 1.25em;}.has-black-color{color: var(--wp--preset--color--black) 
!important;}.has-cyan-bluish-gray-color{color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-color{color: var(--wp--preset--color--white) !important;}.has-pale-pink-color{color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-color{color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-color{color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-color{color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-color{color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-color{color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-color{color: var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-color{color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-color{color: var(--wp--preset--color--vivid-purple) !important;}.has-black-background-color{background-color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-background-color{background-color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-background-color{background-color: var(--wp--preset--color--white) !important;}.has-pale-pink-background-color{background-color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-background-color{background-color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-background-color{background-color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-background-color{background-color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-background-color{background-color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-background-color{background-color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-background-color{background-color: 
var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-background-color{background-color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-background-color{background-color: var(--wp--preset--color--vivid-purple) !important;}.has-black-border-color{border-color: var(--wp--preset--color--black) !important;}.has-cyan-bluish-gray-border-color{border-color: var(--wp--preset--color--cyan-bluish-gray) !important;}.has-white-border-color{border-color: var(--wp--preset--color--white) !important;}.has-pale-pink-border-color{border-color: var(--wp--preset--color--pale-pink) !important;}.has-vivid-red-border-color{border-color: var(--wp--preset--color--vivid-red) !important;}.has-luminous-vivid-orange-border-color{border-color: var(--wp--preset--color--luminous-vivid-orange) !important;}.has-luminous-vivid-amber-border-color{border-color: var(--wp--preset--color--luminous-vivid-amber) !important;}.has-light-green-cyan-border-color{border-color: var(--wp--preset--color--light-green-cyan) !important;}.has-vivid-green-cyan-border-color{border-color: var(--wp--preset--color--vivid-green-cyan) !important;}.has-pale-cyan-blue-border-color{border-color: var(--wp--preset--color--pale-cyan-blue) !important;}.has-vivid-cyan-blue-border-color{border-color: var(--wp--preset--color--vivid-cyan-blue) !important;}.has-vivid-purple-border-color{border-color: var(--wp--preset--color--vivid-purple) !important;}.has-vivid-cyan-blue-to-vivid-purple-gradient-background{background: var(--wp--preset--gradient--vivid-cyan-blue-to-vivid-purple) !important;}.has-light-green-cyan-to-vivid-green-cyan-gradient-background{background: var(--wp--preset--gradient--light-green-cyan-to-vivid-green-cyan) !important;}.has-luminous-vivid-amber-to-luminous-vivid-orange-gradient-background{background: var(--wp--preset--gradient--luminous-vivid-amber-to-luminous-vivid-orange) !important;}.has-luminous-vivid-orange-to-vivid-red-gradient-background{background: 
var(--wp--preset--gradient--luminous-vivid-orange-to-vivid-red) !important;}.has-very-light-gray-to-cyan-bluish-gray-gradient-background{background: var(--wp--preset--gradient--very-light-gray-to-cyan-bluish-gray) !important;}.has-cool-to-warm-spectrum-gradient-background{background: var(--wp--preset--gradient--cool-to-warm-spectrum) !important;}.has-blush-light-purple-gradient-background{background: var(--wp--preset--gradient--blush-light-purple) !important;}.has-blush-bordeaux-gradient-background{background: var(--wp--preset--gradient--blush-bordeaux) !important;}.has-luminous-dusk-gradient-background{background: var(--wp--preset--gradient--luminous-dusk) !important;}.has-pale-ocean-gradient-background{background: var(--wp--preset--gradient--pale-ocean) !important;}.has-electric-grass-gradient-background{background: var(--wp--preset--gradient--electric-grass) !important;}.has-midnight-gradient-background{background: var(--wp--preset--gradient--midnight) !important;}.has-small-font-size{font-size: var(--wp--preset--font-size--small) !important;}.has-medium-font-size{font-size: var(--wp--preset--font-size--medium) !important;}.has-large-font-size{font-size: var(--wp--preset--font-size--large) !important;}.has-x-large-font-size{font-size: var(--wp--preset--font-size--x-large) !important;} :where(.wp-block-post-template.is-layout-flex){gap: 1.25em;}:where(.wp-block-post-template.is-layout-grid){gap: 1.25em;} :where(.wp-block-columns.is-layout-flex){gap: 2em;}:where(.wp-block-columns.is-layout-grid){gap: 2em;} :root :where(.wp-block-pullquote){font-size: 1.5em;line-height: 1.6;} </style> <link rel='stylesheet' id='zetaengnblog-style-css' href='https://engineering.zeta.tech/wp-content/themes/zetaengnblog/style.css?ver=1.0.0' media='all' /> <script id="core-js-js-extra"> var ajax_posts = {"ajaxurl":"https:\/\/engineering.zeta.tech\/wp-admin\/admin-ajax.php","noposts":"No older posts found"}; </script> <script 
src="https://engineering.zeta.tech/wp-content/themes/zetaengnblog/js/core.js?ver=6.6.1" id="core-js-js"></script> <link rel="https://api.w.org/" href="https://engineering.zeta.tech/wp-json/" /><link rel="alternate" title="JSON" type="application/json" href="https://engineering.zeta.tech/wp-json/wp/v2/posts/309" /><link rel="EditURI" type="application/rsd+xml" title="RSD" href="https://engineering.zeta.tech/xmlrpc.php?rsd" /> <link rel="canonical" href="https://engineering.zeta.tech/effective-troubleshooting/" /> <link rel='shortlink' href='https://engineering.zeta.tech/?p=309' /> <link rel="alternate" title="oEmbed (JSON)" type="application/json+oembed" href="https://engineering.zeta.tech/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fengineering.zeta.tech%2Feffective-troubleshooting%2F" /> <link rel="alternate" title="oEmbed (XML)" type="text/xml+oembed" href="https://engineering.zeta.tech/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fengineering.zeta.tech%2Feffective-troubleshooting%2F&format=xml" /> <link rel="pingback" href="https://engineering.zeta.tech/xmlrpc.php"> <meta charset="UTF-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=5"> <meta http-equiv="X-UA-Compatible" content="ie=edge"> <title> Home page </title> <meta name="description" content="Home page"> <meta name="description" content=""> <link rel="alternate" hreflang="en-US" href="" /> <link rel="canonical" href="" /> <meta name="x-sitemap-settings" content="daily,1"> <meta name="google-site-verification" content="Gyg3Spsr-pKlZLcNRhXHV_2X7aHr72bXxNgj-StYLpo" /> <!-- FACEBOOK OG META TAG --> <script src="https://unpkg.com/aos@next/dist/aos.js"></script> <meta property="og:title" content="Launch Next-Gen Credit Cards With Zeta" /> <meta property="og:description" content="Zeta is the world’s first Omni Stack for credit cards. 
A single stack for Origination, Processing, FRM, Rewards, Loans, APIs, and Apps" /> <meta property="og:image" content="https://s3.ap-south-1.amazonaws.com/zeta-website-images-prod/images/zeta-opengraph-preview.png" /> <meta property="og:type" content="website" /> <meta property="og:url" content="https://www.zeta.tech" /> <meta property="og:site_name" content="www.zeta.tech" /> <base href="/" /> <!-- Google Font --> <link rel="preconnect" href="https://fonts.googleapis.com"> <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> <link href="https://fonts.googleapis.com/css2?family=Inter:wght@200;400;500;600;700&family=Manrope:wght@400;500;600;700&display=swap" rel="stylesheet"> <!-- Google Font End --> </head> <body class=" main-page is-white-mobi-nav"> <!-- Web Banner View --> <section class="z-banner"> <div class="cover"></div> <div class="header-search"> <div class="z-container"> <form role="search" method="get" id="searchform" class="searchform" action="https://engineering.zeta.tech/"> <div> <label class="screen-reader-text" for="search-form">Search for:</label> <input type="search" id="search-form" placeholder="Search form" value="" name="s" /> <!-- <input type="submit" id="searchsubmit" value="Search" /> --> </div> </form> </div> </div> <div class="z-container"> <nav> <a href="/" class="logo-top" aria-label="Brand Logo"> <img src="/wp-content/uploads/2022/11/logo-white-xx.png" alt="Brand logo"/> </a> </nav> <h1 class="title"><a href="/" rel="home">Engineering @Zeta</a></h1> </div> </section> <section class="search-bar"> <div class="z-container text-right"> <button type="button" class="search-toggle" aria-label="Toggle search"> <span></span> </button> </div> </section> <!-- / Web Banner View --> <!-- Inner Content View --> <div class="eng-blog-article__wrapper"> <div class="z-container"> <div class="section-inner"> <div id="post-309" class="post-309 post type-post status-publish format-standard has-post-thumbnail hentry category-uncategorized tag-api tag-observability tag-troubleshooting"> 
<div class="content"> <div class="post-header"> <a href="https://engineering.zeta.tech/effective-troubleshooting/" rel="bookmark">Effective Troubleshooting</a> </div> <!-- .post-header --> <div class="featured-media"> <img width="503" height="475" src="https://engineering.zeta.tech/wp-content/uploads/2022/09/6rsyo0-e1662101218245.jpeg" class="attachment-post-image size-post-image wp-post-image" alt="" decoding="async" fetchpriority="high" srcset="https://engineering.zeta.tech/wp-content/uploads/2022/09/6rsyo0-e1662101218245.jpeg 503w, https://engineering.zeta.tech/wp-content/uploads/2022/09/6rsyo0-e1662101218245-300x283.jpeg 300w" sizes="(max-width: 503px) 100vw, 503px" /> </div><!-- .featured-media --> <div class="post-content"> <p>Troubleshooting is a part of an engineer’s life. Whether it is API timeouts, issues with functionality, misconfigurations, or any number of other issues, we often need to roll up our sleeves and fix things. Based on my experience and tenure at <a href="http://www.zeta.tech" data-type="URL" data-id="www.zeta.tech">Zeta</a>, I would like to share some guidelines, learning resources, and tips and tricks that have helped me troubleshoot issues.</p> <h2 class="wp-block-heading"><strong>Guidelines</strong></h2> <p>Incidents can come anytime and challenge us in new ways. Continuous preparation and learning equip us to solve the incidents. There is an ever-growing list that will continue to evolve and I have attempted to capture some key information with the 4Os in mind.</p> <h3 class="wp-block-heading"><strong>The 4Os</strong></h3> <ul class="wp-block-list"><li>Observability: <a href="https://opentelemetry.io/docs/concepts/signals/">Signals</a> emitted by the application contribute toward observability. Lack of observability affects the MTTD because in the absence of the right signals, troubleshooting is based on hypothesis. 
Therefore, it would require some trial and error to confirm and replicate before fixing.</li><li>Operability: Controls to operate the system like turning on and off features, updating configurations, bumping resources, restarting applications, etc. Good operability controls help solve the incidents once the root cause is identified and help reduce MTTR.</li><li>Optimization: No system can support 1 Million TPS from the first day. Optimization needs to continuously happen and keep up with the expected traffic from our customers. This involves not only code changes but also tuning configurations, choice of resources, etc.</li><li>Onboarding: The majority of incidents of lower severity or issues might be due to misconfigurations. Right onboarding with proper steps becomes crucial to avoid incidents related to this.</li></ul> <h3 class="wp-block-heading"><strong>Preparing for Troubleshooting</strong></h3> <div class="wp-block-image"><figure class="aligncenter"><img decoding="async" src="https://lh6.googleusercontent.com/Y1RlAA1X2Er0brkCXhNJ_PL9Fh07Hscnx1ny50Odb9hrjInKAZYgrP4Xcr6IZ5-Uiv3yWDFrmKx_O_g5vDmeIhGlyPBGLnq1MoB5feOfIfOkCVslqFSvQJIilgExn-o70DWRcnWtWwHgY0hq_yHVT9_Ow_cS4mC-HeHvJyXbKoKHUoh2N6H7jSJ_oQ" alt=""/></figure></div> <p><strong>Preparing the Application</strong></p> <ul class="wp-block-list"><li>Ensure your application is publishing the right Signals.</li><li>Use structured logging as it helps in capturing important attributes in logs like entityID and requestID, and in analyzing logs end to end.</li><li>Design the system and APIs with operability in mind. Always design CRUD APIs for an entity and make sure you can use them to fix data issues, disable product features temporarily, etc.</li><li>If operability controls cannot be exposed as APIs, do expose them via JMX. 
Operations that can be performed via JMX are as follows:<ul><li>Clear Cache</li><li>Change log levels</li><li>Disable features</li><li>Increase or decrease cache size</li></ul></li><li>Get PGWatch and PGBadger enabled for all the PostgreSQL databases your application connects to. This helps in troubleshooting querying performance. These can be monitored regularly.</li><li>Having a good test coverage might not seem related to incidents, but having 80% test cases not only helps prevent incidents but can also help in reproducing issues locally.</li><li>Performance benchmarking the application, having a dedicated setup, and knowing the TPS of your APIs helps to know the right configurations for your application in production, the known supported TPS and doing this exercise is in itself a good learning.</li><li>Have runbooks handy around the flows with mitigation steps.</li></ul> <h4 class="wp-block-heading"><strong>Preparing the Cluster</strong></h4> <div class="wp-block-image"><figure class="aligncenter"><img decoding="async" src="https://lh6.googleusercontent.com/S_EhTrk6ZbtO-ojLyqzbPAJZ43exymWn216ubpUXaeUmOUYZcgzGtC9YKtROjZhRWtyts_ag6EhNs1lYKUv8skyRyUp37mlzqXbQA7DVXjmL0h4Qi8gt3qQ5kH5A1LuTjQHASk824y2opt3pTVI2-PT7yxTRojvXLy0X2LqXynr3JgVhn373SSQnxQ" alt=""/></figure></div> <ul class="wp-block-list"><li>Ensure auditing is proper at all layers. This is very useful in determining which call landed for your APIs, how much time it took, and by whom it was called. </li><li>Ensure all the applications emit appropriate signals and can be effectively monitored.</li><li>Ensure customer services owned by the cluster are well known and documented and runbooks prepared for it.</li><li>Ensure Configurations to operate the system are properly documented, idempotent, and have minimal steps. 
Keep on iterating them to include new features.</li><li>Maintain a runbook containing flows documented by application and customer service containing known issues and resolution steps.</li></ul> <h4 class="wp-block-heading"><strong>Preparing Yourself</strong></h4> <div class="wp-block-image"><figure class="aligncenter"><img decoding="async" src="https://lh5.googleusercontent.com/7bIdkmRF-fSMAAV06RZYfPFd1R0sDZg02Qn5MoRcPQJtkYhC9EivSL8H5DiWhXyaQC9oJ5R42i0kGJhaHEv7qwn9movjof1s-g_tSA6T-SOyMdXsyN188MZ2QvVrQbvL8Zv1lfnW9oR6KTf-HteCxUlVnMc451n1EFzGpawNG2kzfBw8zj_vGowUQg" alt=""/></figure></div> <ul class="wp-block-list"><li>Be familiar with Observability Tools used. They provide a lot of insights while troubleshooting incidents.</li><li>Product Context helps a lot. Go through the resources like training videos, documents, and code to be familiar with the critical flows. Connecting the business context, domain, and technical context helps relate the issue with the impact and probable fault points.</li><li>Know thy system well and know the systems you depend on and the systems which depend on you better.</li><li>Familiarity with the tools like Kibana, Prometheus Queries, Grafana, Eclipse MAT, Kubectl, etc., helps a lot.</li></ul> <h3 class="wp-block-heading"><strong>During Troubleshooting</strong></h3> <ul class="wp-block-list"><li>While restarting to solve a problem, ensure you always take the thread and heap dump of the java process. It’s all about the evidence.</li><li>Always check if the problem is with only one instance due to multi-threading/concurrency issues. Deadlocks can cause that; in this case, taking a thread dump and restarting can quickly help resolve the issue.</li><li>There are many ways to solve a problem. Try to get out of the incident/issue first and then work on improvement. 
Sometimes out of the box solutions can save us a lot of time and get us out of tough situations.</li><li>When reporting an issue or passing the baton to another team, always provide as much supporting information as possible. Filling these in the FIR helps with triaging and sometimes can help in quick pointers based on a birds-eye view. Some examples are as follows:<ul><li>Kibana Link containing logs</li><li>Inputs Passed</li><li>APIs called</li><li>Errors observed</li><li>Time Window</li><li>Grafana Dashboards Link containing key metrics which might indicate a problem</li><li>Sequence of Steps performed</li><li>Configurations related to the issue</li><li>Reference for Code for your team or others </li></ul></li><li>Actively report your observations in the preferred internal communication medium of triaging issues or incidents. It helps in keeping the stakeholders posted and might help in parallel debugging and some Eureka moments.</li></ul> <h3 class="wp-block-heading"><strong>After Troubleshooting</strong></h3> <ul class="wp-block-list"><li>Rigor in RCA and IAI is very important. Whether the issue is for one user vs. an issue for millions of users, it does not matter, as ignoring an issue with lower impact might lead to the issue getting escalated in terms of impact.</li><li>Always try to identify IAIs. The IAIs can be process related, product related, or tech related. It might be specific to one cluster or applicable across, which is also acceptable.</li><li>When doing RCA of an incident, consider the 5 Whys to be completed only when you know the root cause that will fix the issue for good and avoid reoccurrence.</li><li>Capture all the evidence in the RCA Document. Since links expire, capturing screenshots helps.</li><li>Do not forget to prioritize the IAIs.</li><li>All the evidence may not be available. 
Refer to the Tips and Tricks section on how to solve these.</li><li>Find IAIs which improve the 4Os.</li></ul> <h2 class="wp-block-heading"><strong>Tips and Tricks</strong></h2> <div class="wp-block-image"><figure class="aligncenter"><img decoding="async" src="https://lh6.googleusercontent.com/RxOPdeUe1DDvm39qa3fjnq5Xz18j4dku8B8nA6leaZQbYCGsk-pPuI5mS9vKS0W231TFFnXlJ4YMjwInx6b-QatCpL4jzdgEJGMGEqVLc_hAREjViFqymBkYl_UuvrsnI8C7pH5Wa7PVaJ0w_BVUvPQxudUU-yHBvid7tb1EqjYWUoTF4KtOqVKyWQ" alt=""/></figure></div> <p>There are times when we may fall short on the observations and are unclear on what to do next. Some tips and tricks which can help are as follows:</p> <h3 class="wp-block-heading"><strong>Timeouts</strong></h3> <ul class="wp-block-list"><li>Ensure common libraries used have the right instrumentation and logs to track ingress and egress flows. If available, use them to check logs around that time window.</li><li>Add logs and metrics around ingress and egress flows for an application and simulate again to reproduce.</li><li>Check the resources allocated to Kubernetes pods. CPU throttling of even 1% can impact the application heavily.</li><li>Check the code line by line from source to destination to see inefficiencies. Some of the common inefficiencies are:<ul><li>Connection Pool settings for HTTP Calls.</li><li>Connection Pool settings for DB Calls.</li><li>Time taken by external calls. Percentile 95 and 99 metrics. 
Variations in them.</li><li>Requests getting queued in the executor used for external calls.</li><li>Time spent in Executor Queues.</li></ul></li><li>Check the data: the problem might be with the setup or a specific input, because the associated data might be the reason for the inefficiency.</li></ul> <h3 class="wp-block-heading"><strong>Queries taking time on PostgreSQL RDS</strong></h3> <ul class="wp-block-list"><li>Use EXPLAIN and ANALYZE to check the query plan.</li><li>If your database queries are taking more than 20 ms at the 95th percentile, especially for a table with fewer than 1 million rows, assume there is a problem and start analyzing it.</li><li>Checking the slowest individual queries in PGBadger helps.</li><li>Ensure RDS has sufficient resources and is not running low on CPU, Memory, or IOPS.</li><li>If the cause of the issue is still unclear, enable Performance Insights to monitor instance performance.</li><li>PGWatch has nice dashboards which capture very useful information about what’s happening in the database. 
Check out the Juno integration in OWCC and ensure PGWatch is enabled with dashboards getting populated.</li></ul> <h2 class="wp-block-heading"><strong>Learning Resources</strong></h2> <ul class="wp-block-list"><li><a href="https://kubernetes.io/docs/reference/kubectl/cheatsheet/">kubectl cheatsheet</a></li><li><a href="https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/">Resource Management for Pods and Containers</a></li><li><a href="https://www.elastic.co/guide/en/elasticsearch/reference/7.7/query-dsl-query-string-query.html#query-string-syntax">Lucene Syntax</a></li><li><a href="https://static.googleusercontent.com/media/sre.google/en//static/pdf/building_secure_and_reliable_systems.pdf">Building Secure and Reliable Systems</a></li><li><a href="https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/CHAP_Storage.html">Amazon RDS DB instance storage</a></li><li><a href="https://zeta-tm.atlassian.net/wiki/spaces/AURA/pages/2658371182/PostgreSQL+Key+Optimization+Areas">PostgreSQL Key Optimization Areas</a></li></ul> <h2 class="wp-block-heading"><strong>Tools</strong></h2> <ul class="wp-block-list"><li><a href="https://www.eclipse.org/mat/">Eclipse MAT</a></li><li><a href="https://fastthread.io/">https://fastthread.io/</a></li><li><a href="https://minikube.sigs.k8s.io/docs/start/">Minikube</a></li></ul> <figure class="wp-block-image"><img decoding="async" src="https://lh6.googleusercontent.com/0xJ0w4-C1IhJh3hoRIDH_WxMeqHRCEICs9jvhIaLNy1FN9BhSCPSCIfTOeCZLcOHZ72cQPOaB5tvnj5nyok0EII7AXTlv3wX5E496074hOkAqilpQio9Gn3k0il0ZP6nGBscP7VqL0n-lPDIC_BtDAI9NJtJ5xbIC5sOpBmqCcFCzDQdsL52C9nrPQ" alt=""/></figure> <p>Author: Shubham Jha</p> <p>Edits by: Mercy Mochary, Swetha Kommi</p> <p>Reviewed by: Phani Marupaka</p> </div> <div class="post-meta-container"> <div class="post-author"> <div class="post-author-content"> <h4>Phani</h4> <div class="author-links"> <a class="author-link-posts" href="https://engineering.zeta.tech/author/phani_marpaka/">Author archive</a> 
</div><!-- .author-links --> </div><!-- .post-author-content --> </div><!-- .post-author --> <div class="post-meta"> <p class="post-date">September 2, 2022</p> <p class="post-categories"><a href="https://engineering.zeta.tech/category/uncategorized/" rel="category tag">Uncategorized</a></p> <p class="post-tags"><a href="https://engineering.zeta.tech/tag/api/" rel="tag">API</a>, <a href="https://engineering.zeta.tech/tag/observability/" rel="tag">Observability</a>, <a href="https://engineering.zeta.tech/tag/troubleshooting/" rel="tag">Troubleshooting</a></p> <div class="clear"></div> <!-- post-nav --> <div class="post-nav"> <a class="post-nav-prev" href="https://engineering.zeta.tech/zeta-tech-stack/">Previous post</a> <a class="post-nav-next" href="https://engineering.zeta.tech/prestoprestodb-what-it-offers-and-where-and-how-it-can-be-used/">Next post</a> <div class="clear"></div> </div><!-- .post-nav --> </div> <!-- post-meta --> <div class="clear"></div> </div> <div id="comments" class="comments-area"> <div id="respond" class="comment-respond"> <h3 id="reply-title" class="comment-reply-title">Leave a Reply <small><a rel="nofollow" id="cancel-comment-reply-link" href="/effective-troubleshooting/#respond" style="display:none;">Cancel reply</a></small></h3><form action="https://engineering.zeta.tech/wp-comments-post.php?wpe-comment-post=engnblog" method="post" id="commentform" class="comment-form" novalidate><p class="comment-notes"><span id="email-notes">Your email address will not be published.</span> <span class="required-field-message">Required fields are marked <span class="required">*</span></span></p><p class="comment-form-comment"><label for="comment">Comment <span class="required">*</span></label> <textarea id="comment" name="comment" cols="45" rows="8" maxlength="65525" required></textarea></p><p class="comment-form-author"><label for="author">Name <span class="required">*</span></label> <input id="author" name="author" type="text" value="" size="30" 
maxlength="245" autocomplete="name" required /></p> <p class="comment-form-email"><label for="email">Email <span class="required">*</span></label> <input id="email" name="email" type="email" value="" size="30" maxlength="100" aria-describedby="email-notes" autocomplete="email" required /></p> <p class="comment-form-url"><label for="url">Website</label> <input id="url" name="url" type="url" value="" size="30" maxlength="200" autocomplete="url" /></p> <p class="comment-form-cookies-consent"><input id="wp-comment-cookies-consent" name="wp-comment-cookies-consent" type="checkbox" value="yes" /> <label for="wp-comment-cookies-consent">Save my name, email, and website in this browser for the next time I comment.</label></p> <p class="form-submit"><input name="submit" type="submit" id="submit" class="submit" value="Post Comment" /> <input type='hidden' name='comment_post_ID' value='309' id='comment_post_ID' /> <input type='hidden' name='comment_parent' id='comment_parent' value='0' /> </p></form> </div><!-- #respond --> </div><!-- #comments --> </div> <div class="sidebar"> <div class="widget"> <input type="search" class="search-field" placeholder="Search form" value=""> </div> <div class="widget entries"> <div class="widget-content"> <h3 class="widget-title">Recent Posts</h3> <ul> <li> <a href="#" aria-current="page">Effective Troubleshooting</a> </li> <li> <a href="#">Zeta Tech Stack</a> </li> <li> <a href="#">Buy Now Pay Later: How Does it Work?</a> </li> <li> <a href="#">Buy Now Pay Later: The Real Deal or Just Hype</a> </li> <li> <a href="#">Buy Now Pay Later: The Modern Day Credit Card</a> </li> </ul> </div> </div> </div> <div class="clear"></div> </div> </div> </div> </div> <!-- / Inner Content View --> <!-- Footer --> <footer class="footer-section custom-footer"> <div class="z-container"> <div class="footer-table"> <div class="footer-table__row"> <div id="footer-sidebar1" class="footer-table__cell1 width__40"> <aside> <h3 class="widget-title">RECENT POSTS</h3> <ul> 
<li><a href="https://engineering.zeta.tech/prestoprestodb-what-it-offers-and-where-and-how-it-can-be-used/">Presto(PrestoDB)- What it Offers and Where and How it can be used</a> <span class="post-date">January 24, 2023</span></li> <li><a href="https://engineering.zeta.tech/effective-troubleshooting/">Effective Troubleshooting</a> <span class="post-date">September 2, 2022</span></li> <li><a href="https://engineering.zeta.tech/zeta-tech-stack/">Zeta Tech Stack</a> <span class="post-date">September 15, 2021</span></li> <li><a href="https://engineering.zeta.tech/buy-now-pay-later-how-does-it-work/">Buy Now Pay Later: How Does it Work?</a> <span class="post-date">July 26, 2021</span></li> <li><a href="https://engineering.zeta.tech/buy-now-pay-later-the-real-deal-or-just-hype/">Buy Now Pay Later: The Real Deal or Just Hype</a> <span class="post-date">July 26, 2021</span></li> </ul> </aside> </div> <div id="footer-sidebar2" class="footer-table__cell2"> <aside id="nav_menu-4" class="widget widget_nav_menu"><h3 class="widget-title">INTERESTING READS</h3><div class="menu-dummy-container"><ul id="menu-dummy" class="menu"><li id="menu-item-318" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-318"><a href="https://engineering.zeta.tech/bengaluru/">Bengaluru</a></li> <li id="menu-item-319" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-319"><a href="https://engineering.zeta.tech/hot-desking/">Hot Desking</a></li> <li id="menu-item-320" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-320"><a href="https://engineering.zeta.tech/groups/">Groups</a></li> </ul></div></aside></div> <div id="footer-sidebar3" class="footer-table__cell3"> <aside id="nav_menu-3" class="widget widget_nav_menu"><h3 class="widget-title">FOR ZETA EMPLOYEES</h3><div class="menu-general-container"><ul id="menu-general" class="menu"><li id="menu-item-313" class="menu-item menu-item-type-custom menu-item-object-custom 
menu-item-has-children menu-item-313"><a href="#">Hotdesk</a> <ul class="sub-menu"> <li id="menu-item-314" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-314"><a href="https://engineering.zeta.tech/bengaluru/">Bengaluru</a></li> <li id="menu-item-315" class="menu-item menu-item-type-post_type menu-item-object-page menu-item-315"><a href="https://engineering.zeta.tech/mumbai/">Mumbai</a></li> </ul> </li> </ul></div></aside></div> </div> </div> </div> </footer> <section class="footer-bar"> <div class="z-container flex"> <p class="footer-bar__left"> © 2022 <a href="#">Engineering @Zeta</a> <span> — © 2021 Better World Technology Pvt. Ltd.</span> </p> <p class="footer-bar__right"> <a class="goto-top" href="javascript:void(0)">Go Up ↑</a> </p> </div> </section> <!-- Footer End --> <script src="https://engineering.zeta.tech/wp-content/themes/zetaengnblog/js/navigation.js?ver=1.0.0" id="zetaengnblog-navigation-js"></script> <script src="https://engineering.zeta.tech/wp-includes/js/comment-reply.min.js?ver=6.6.1" id="comment-reply-js" async data-wp-strategy="async"></script> <script src="https://engineering.zeta.tech/wp-content/themes/zetaengnblog/js/main.9247e6bf2f7c0404c8a7.js?ver=1.0.0" id="zetaengnblog-blog-js"></script> <script src="https://engineering.zeta.tech/wp-content/themes/zetaengnblog/js/styles.e772312fa7689f75b2c1.js?ver=1.0.0" id="zetaengnblog-blog1-js"></script> </body> </html>