CINXE.COM
<!doctype html> <html lang="en" dir="ltr" class="blog-wrapper blog-post-page plugin-blog plugin-id-default" data-has-hydrated="false"> <head> <meta charset="UTF-8"> <meta name="generator" content="Docusaurus v3.0.1"> <title data-rh="true">LLM as judge | MLflow</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:image" content="https://your-docusaurus-site.example.com/img/docusaurus-social-card.jpg"><meta data-rh="true" name="twitter:image" content="https://your-docusaurus-site.example.com/img/docusaurus-social-card.jpg"><meta data-rh="true" property="og:url" content="https://your-docusaurus-site.example.com/blog/llm-as-judge"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docsearch:docusaurus_tag" content="default"><meta data-rh="true" property="og:title" content="LLM as judge | MLflow"><meta data-rh="true" name="description" content="Perform LLM Evaluations with custom metrics"><meta data-rh="true" property="og:description" content="Perform LLM Evaluations with custom metrics"><meta data-rh="true" property="og:type" content="article"><meta data-rh="true" property="article:published_time" content="2024-10-03T00:00:00.000Z"><meta data-rh="true" property="article:author" content="https://www.linkedin.com/in/pedro-azevedo-/,https://www.linkedin.com/in/rahulpandey1901/"><meta data-rh="true" property="article:tag" content="genai,mlflow-evalaute"><link data-rh="true" rel="icon" href="/img/mlflow-favicon.ico"><link data-rh="true" rel="canonical" href="https://your-docusaurus-site.example.com/blog/llm-as-judge"><link data-rh="true" rel="alternate" href="https://your-docusaurus-site.example.com/blog/llm-as-judge" hreflang="en"><link data-rh="true" rel="alternate" href="https://your-docusaurus-site.example.com/blog/llm-as-judge" hreflang="x-default"><link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="MLflow RSS Feed"> <link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="MLflow Atom Feed"> <link rel="preconnect" href="https://www.google-analytics.com"> <link rel="preconnect" href="https://www.googletagmanager.com"> <script async src="https://www.googletagmanager.com/gtag/js?id=AW-16857946923"></script> <script>function gtag(){dataLayer.push(arguments)}window.dataLayer=window.dataLayer||[],gtag("js",new Date),gtag("config","AW-16857946923",{anonymize_ip:!0})</script> <link rel="preconnect" href="https://www.googletagmanager.com"> <script>window.dataLayer=window.dataLayer||[]</script> <script>!function(e,t,a,n,g){e[n]=e[n]||[],e[n].push({"gtm.start":(new Date).getTime(),event:"gtm.js"});var m=t.getElementsByTagName(a)[0],r=t.createElement(a);r.async=!0,r.src="https://www.googletagmanager.com/gtm.js?id=GTM-N6WMTTJ",m.parentNode.insertBefore(r,m)}(window,document,"script","dataLayer")</script> <link rel="alternate" type="application/rss+xml" href="/releases/rss.xml" title="MLflow RSS Feed"> <link rel="alternate" type="application/atom+xml" href="/releases/atom.xml" title="MLflow Atom Feed"><link rel="stylesheet" href="/assets/css/styles.3d12b415.css"> <script src="/assets/js/runtime~main.6cbf33f6.js" defer="defer"></script> <script src="/assets/js/main.970cc8a9.js" defer="defer"></script> </head> <body class="navigation-with-keyboard"> <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-N6WMTTJ" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript> <script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return localStorage.getItem("theme")}catch(t){}}();t(null!==e?e:"dark")}(),function(){try{const a=new URLSearchParams(window.location.search).entries();for(var[t,e]of a)if(t.startsWith("docusaurus-data-")){var n=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(n,e)}}catch(t){}}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:#0194e2;color:#ffffff" role="banner"><div class="content_knG7 announcementBarContent_xLdY">Help us improve MLflow by taking our <a target="_blank" rel="noopener noreferrer" href="https://surveys.training.databricks.com/jfe/form/SV_cA2jrfBjs6vi6SG">survey</a>!</div></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><div class="navbar__logo"><img src="/img/mlflow-white.svg" alt="MLflow" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/img/mlflow-black.svg" alt="MLflow" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div></a></div><div class="navbar__items navbar__items--right"><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/blog">Blog</a><a class="navbar__item navbar__link" href="/releases">Releases</a><a href="/docs/latest/index.html" target="_self" rel="noopener noreferrer" class="navbar__item navbar__link">Docs</a><a href="https://github.com/mlflow/mlflow" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">Contribute<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a><a class="navbar__item navbar__link" href="/ambassador">Ambassador Program</a><a href="/docs/latest/getting-started/index.html" target="_self" rel="noopener noreferrer" class="navbar__item navbar__link navbar__item__get-started">Get Started</a><div class="navbarSearchContainer_Bca1"></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><div class="container margin-vert--lg"><div class="row"><main class="col col--9 col--offset-1" itemscope="" itemtype="https://schema.org/Blog"><article itemprop="blogPost" itemscope="" itemtype="https://schema.org/BlogPosting"><meta itemprop="description" content="Perform LLM Evaluations with custom metrics"><header><h1 class="title_f1Hy" itemprop="headline">LLM as judge</h1><div class="container_mt6G margin-vert--md"><time datetime="2024-10-03T00:00:00.000Z" itemprop="datePublished">October 3, 2024</time> · <!-- -->17 min read</div><div class="margin-top--md margin-bottom--sm row"><div class="col col--6 authorCol_Hf19"><div class="avatar margin-bottom--sm"><a href="https://www.linkedin.com/in/pedro-azevedo-/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link"><img class="avatar__photo" src="/img/authors/pedro.png" alt="Pedro Azevedo" itemprop="image"></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><div class="avatar__name"><a href="https://www.linkedin.com/in/pedro-azevedo-/" target="_blank" rel="noopener noreferrer" itemprop="url"><span itemprop="name">Pedro Azevedo</span></a></div><small class="avatar__subtitle" itemprop="description">Machine Learning Analyst at Adidas</small></div></div></div><div class="col col--6 authorCol_Hf19"><div class="avatar margin-bottom--sm"><a href="https://www.linkedin.com/in/rahulpandey1901/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link"><img class="avatar__photo" src="/img/ambassadors/Rahul_Pandey.png" alt="Rahul Pandey" itemprop="image"></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><div class="avatar__name"><a href="https://www.linkedin.com/in/rahulpandey1901/" target="_blank" rel="noopener noreferrer" itemprop="url"><span itemprop="name">Rahul Pandey</span></a></div><small class="avatar__subtitle" itemprop="description">Sr. Solutions Architect at adidas</small></div></div></div></div></header><div id="__blog-post-container" class="markdown" itemprop="articleBody"><p>In this blog post, we'll dive on a journey to revolutionize how we evaluate language models. We'll explore the power of MLflow Evaluate and harness the capabilities of Large Language Models (LLMs) as judges. By the end, you'll learn how to create custom metrics, implement LLM-based evaluation, and apply these techniques to real-world scenarios. Get ready to transform your model assessment process and gain deeper insights into your AI's performance!</p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="the-challenge-of-evaluating-language-models">The Challenge of Evaluating Language Models<a href="#the-challenge-of-evaluating-language-models" class="hash-link" aria-label="Direct link to The Challenge of Evaluating Language Models" title="Direct link to The Challenge of Evaluating Language Models"></a></h2> <p>Evaluating large language models (LLMs) and natural language processing (NLP) systems presents several challenges, primarily due to their complexity and the diversity of tasks they can perform.</p> <p>One major difficulty is creating metrics that comprehensively measure performance across varied applications, from generating coherent text to understanding nuanced human emotions. Traditional benchmarks often fail to capture these subtleties, leading to incomplete assessments.</p> <p>An LLM acting as a judge can address these issues by leveraging its extensive training data to provide a more nuanced evaluation, offering insights into model behavior and areas needing improvement. For instance, an LLM can analyze whether a model generates text that is not only grammatically correct but also contextually appropriate and engaging, something more static metrics might miss.</p> <p>However, to move forward effectively, we need more than just better evaluation methods. Standardized experimentation setups are essential to ensure that comparisons between models are both fair and replicable. A uniform framework for testing and evaluation would enable researchers to build on each other's work, leading to more consistent progress and the development of more robust models.</p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="introducing-mlflow-llm-evaluate">Introducing MLflow LLM Evaluate<a href="#introducing-mlflow-llm-evaluate" class="hash-link" aria-label="Direct link to Introducing MLflow LLM Evaluate" title="Direct link to Introducing MLflow LLM Evaluate"></a></h2> <p><a href="https://mlflow.org/docs/latest/llms/llm-evaluate/index.html" target="_blank" rel="noopener noreferrer">MLflow LLM Evaluate</a> is a powerful function within the MLflow ecosystem that allows for comprehensive model assessment by providing a standardized experiment setup. It supports both built-in metrics and custom (LLM) metrics, making it an ideal tool for evaluating complex language tasks. With <a href="https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate" target="_blank" rel="noopener noreferrer">MLflow LLM Evaluate</a>, you can:</p> <ul> <li>Evaluate models against multiple metrics simultaneously</li> <li>Use pre-defined metrics for specific model types (e.g., question-answering, text-summarization and pure text)</li> <li>Create custom metrics, including those that use LLMs as judges using <a href="https://mlflow.org/docs/latest/python_api/mlflow.metrics.html#mlflow.metrics.genai.make_genai_metric" target="_blank" rel="noopener noreferrer">mlflow.metrics.genai.make_genai_metric()</a> and <a href="https://mlflow.org/docs/latest/python_api/mlflow.metrics.html#mlflow.metrics.genai.make_genai_metric_from_prompt" target="_blank" rel="noopener noreferrer">mlflow.metrics.genai.make_genai_metric_from_prompt()</a></li> </ul> <p><img loading="lazy" alt="MLflow Evaluate" src="/assets/images/mlflow_evaluate.drawio-630de00051316c6b28dab80e5626f630.svg" width="700" height="300" class="img_ev3q"></p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="conquering-new-markets-with-an-llm-as-a-judge">Conquering new markets with an LLM as a judge<a href="#conquering-new-markets-with-an-llm-as-a-judge" class="hash-link" aria-label="Direct link to Conquering new markets with an LLM as a judge" title="Direct link to Conquering new markets with an LLM as a judge"></a></h2> <p>Imagine you're part of a global travel agency, "WorldWide Wandercorp," that's expanding its reach to Spanish-speaking countries.</p> <p>Your team has developed an AI-powered translation system to help create culturally appropriate marketing materials and customer communications. However, as you begin to use this system, you realize that traditional evaluation metrics, such as BLEU (Bilingual Evaluation Understudy), fall short in capturing the nuances of language translation, especially when it comes to preserving cultural context and idiomatic expressions.</p> <p>For instance, consider the phrase "kick the bucket." A direct translation might focus on the literal words, but the idiom actually means "to die." A traditional metric like BLEU may incorrectly evaluate the translation as adequate if the translated words match a reference translation, even if the cultural meaning is lost. In such cases, the metric might score the translation highly despite it being completely inappropriate in context. This could lead to embarrassing or culturally insensitive marketing content, which is something your team wants to avoid.</p> <p>You need a way to evaluate whether the translation not only is accurate but also preserves the intended meaning, tone, and cultural context. This is where <a href="https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate" target="_blank" rel="noopener noreferrer">MLflow Evaluate</a> and LLMs (Large Language Models) as judges come into play. These tools can assess translations more holistically by considering context, idiomatic expressions, and cultural relevance, providing a more reliable evaluation of the AI’s output.</p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="custom-metrics-tailoring-evaluation-to-your-needs">Custom Metrics: Tailoring Evaluation to Your Needs<a href="#custom-metrics-tailoring-evaluation-to-your-needs" class="hash-link" aria-label="Direct link to Custom Metrics: Tailoring Evaluation to Your Needs" title="Direct link to Custom Metrics: Tailoring Evaluation to Your Needs"></a></h2> <p>In the following section, we’ll implement three metrics:</p> <ul> <li>The <code>"cultural_sensitivity"</code> metric ensures translations maintain cultural context and appropriateness.</li> <li>The <code>"faithfulness"</code> metric checks that chatbot responses align accurately with company policies and retrieved content.</li> <li>The <code>"toxicity"</code> metric evaluates responses for harmful or inappropriate content, ensuring respectful customer interactions.</li> </ul> <p>These metrics will help Worldwide WanderAgency ensure their AI-driven translations and interactions meet their specific needs.</p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="evaluating-worldwide-wanderagencys-ai-systems">Evaluating Worldwide WanderAgency's AI Systems<a href="#evaluating-worldwide-wanderagencys-ai-systems" class="hash-link" aria-label="Direct link to Evaluating Worldwide WanderAgency's AI Systems" title="Direct link to Evaluating Worldwide WanderAgency's AI Systems"></a></h2> <p>Now that we understand WanderAgency's challenges, let's dive into a code walkthrough to address them. We'll implement custom metrics to measure AI performance and build a gauge visualization chart for sharing results with stakeholders.</p> <p>We'll start by evaluating a language translation model, focusing on the "cultural_sensitivity" metric to ensure it preserves cultural nuances. This will help WanderAgency maintain high standards in global communication.</p> <h3 class="anchor anchorWithStickyNavbar_LWe7" id="cultural-sensitivity-metric">Cultural Sensitivity Metric<a href="#cultural-sensitivity-metric" class="hash-link" aria-label="Direct link to Cultural Sensitivity Metric" title="Direct link to Cultural Sensitivity Metric"></a></h3> <p>The travel agency wants to ensure their translations are not only accurate but also culturally appropriate. To achieve this they are considering creating a custom metric that allows Worldwide WanderAgency to quantify how well their translations maintain cultural context and idiomatic expressions.</p> <p>For instance, a phrase that is polite in one culture might be inappropriate in another. In English, addressing someone as "Dear" in a professional email might be seen as polite. However, in Spanish, using "Querido" in a professional context can be too personal and inappropriate.</p> <p>How can we evaluate such an abstract concept in a systematic way? Traditional Metrics would fall short so we need a better way of doing it. In this case LLM as a judge would be a great fit! For this use case let's create a "cultural_sensitivity" metric.</p> <p>Here's a brief overview of the process: Start by installing all the necessary libraries for this demo to work.</p> <div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token plain">pip install mlflow>=2.14.1 openai transformers torch torchvision evaluate datasets tiktoken fastapi rouge_score textstat tenacity plotly ipykernel nbformat>=5.10.4</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>We will be using gpt3.5 and gpt4 during this example for that let's start by making sure our <a href="https://mlflow.org/docs/latest/llms/openai/notebooks/openai-quickstart.html#API-Key-Security-Overview" target="_blank" rel="noopener noreferrer">OpenAI key is setup</a>.</p> <p>Import the necessary libraries.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token keyword" style="color:rgb(86, 156, 214)">import</span><span class="token plain"> mlflow</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">import</span><span class="token plain"> os</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token comment" style="color:rgb(106, 153, 85)"># Run a quick validation that we have an entry for the OPEN_API_KEY within environment variables</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">assert</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"OPENAI_API_KEY"</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(86, 156, 214)">in</span><span class="token plain"> os</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">environ</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"OPENAI_API_KEY environment variable must be set"</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">import</span><span class="token plain"> openai</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">import</span><span class="token plain"> pandas </span><span class="token keyword" style="color:rgb(86, 156, 214)">as</span><span class="token plain"> pd</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>When using the <a href="https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate" target="_blank" rel="noopener noreferrer"><code>mlflow.evaluate()</code></a> function, your large language model (LLM) can take one of the following forms:</p> <ol> <li>A <code>mlflow.pyfunc.PyFuncModel()</code> — typically an MLflow model.</li> <li>A Python function that accepts strings as inputs and returns a single string as output.</li> <li>An <code>MLflow Deployments</code> endpoint URI.</li> <li><code>model=None</code> if the data you are providing has already been scored by a model, and you do not need to specify one.</li> </ol> <p>For this example, we will use an MLflow model.</p> <p>We’ll begin by logging a translation model in MLflow. For this tutorial, we'll use GPT-3.5 with a defined system prompt.</p> <p>In a production environment, you would typically experiment with different prompts and models to determine the most suitable configuration for your use case. For more details, refer to MLflow’s <a href="https://mlflow.org/docs/latest/llms/prompt-engineering/index.html" target="_blank" rel="noopener noreferrer">Prompt Engineering UI</a>.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">system_prompt </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"Translate the following sentences into Spanish"</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token comment" style="color:rgb(106, 153, 85)"># Let's set up an experiment to make it easier to track our results</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">set_experiment</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token string" style="color:rgb(206, 145, 120)">"/Path/to/your/experiment"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">basic_translation_model </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">openai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">log_model</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> model</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"gpt-3.5-turbo"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> task</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain">openai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">chat</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">completions</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> artifact_path</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"model"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> messages</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">"role"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"system"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"content"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> system_prompt</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">"role"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"user"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"content"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"{user_input}"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>Let's test the model to make sure it works.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token plain">model </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">pyfunc</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">load_model</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">basic_translation_model</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">model_uri</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">model</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">predict</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token string" style="color:rgb(206, 145, 120)">"Hello, how are you?"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token comment" style="color:rgb(106, 153, 85)"># Output = ['¡Hola, ¿cómo estás?']</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>To use <a href="https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate" target="_blank" rel="noopener noreferrer"><code>mlflow.evaluate()</code></a>, we first need to prepare sample data that will serve as input to our LLM. In this scenario, the input would consist of the content the company is aiming to translate.</p> <p>For demonstration purposes, we will define a set of common English expressions that we want the model to translate.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token comment" style="color:rgb(106, 153, 85)"># Prepare evaluation data</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">eval_data </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> pd</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">DataFrame</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"llm_inputs"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"I'm over the moon about the news!"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"Spill the beans."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"Bite the bullet."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"Better late than never."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>To meet the objectives of the travel agency, we will define custom metrics that evaluate the quality of translations. In particular, we need to assess how faithfully the translations capture not only the literal meaning but also cultural nuances.</p> <p>By default, <a href="https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate" target="_blank" rel="noopener noreferrer"><code>mlflow.evaluate()</code></a> uses <code>openai:/gpt-4</code> as the evaluation model. However, you also have the option to use a <a href="https://mlflow.org/docs/latest/llms/llm-evaluate/index.html#selecting-the-llm-as-judge-model" target="_blank" rel="noopener noreferrer">local model for evaluation</a>, such as a model wrapped in a PyFunc (e.g., Ollama).</p> <p>For this example, we will use GPT-4 as the evaluation model.</p> <p>To begin, provide a few examples that illustrate good and poor translation scores.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token comment" style="color:rgb(106, 153, 85)"># Define the custom metric</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">cultural_sensitivity </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">make_genai_metric</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> name</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"cultural_sensitivity"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> definition</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Assesses how well the translation preserves cultural nuances and idioms."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> grading_prompt</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Score from 1-5, where 1 is culturally insensitive and 5 is highly culturally aware."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> examples</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Break a leg!"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"¡Rómpete una pierna!"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">2</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This is a literal translation that doesn't capture the idiomatic meaning."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Break a leg!"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"¡Mucha mierda!"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">5</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This translation uses the equivalent Spanish theater idiom, showing high cultural awareness."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"It's raining cats and dogs."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Está lloviendo gatos y perros."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This literal translation does not convey the idiomatic meaning of heavy rain."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"It's raining cats and dogs."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Está lloviendo a cántaros."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">5</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This translation uses a Spanish idiom that accurately conveys the meaning of heavy rain."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Kick the bucket."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Patear el balde."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This literal translation fails to convey the idiomatic meaning of dying."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Kick the bucket."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Estirar la pata."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">5</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This translation uses the equivalent Spanish idiom for dying, showing high cultural awareness."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Once in a blue moon."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Una vez en una luna azul."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">2</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This literal translation does not capture the rarity implied by the idiom."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Once in a blue moon."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"De vez en cuando."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">4</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This translation captures the infrequency but lacks the idiomatic color of the original."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"The ball is in your court."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"La pelota está en tu cancha."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">3</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This translation is understandable but somewhat lacks the idiomatic nuance of making a decision."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"The ball is in your court."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Te toca a ti."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">5</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"This translation accurately conveys the idiomatic meaning of it being someone else's turn to act."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> model</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"openai:/gpt-4"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> parameters</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">"temperature"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token number" style="color:rgb(181, 206, 168)">0.0</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <h3 class="anchor anchorWithStickyNavbar_LWe7" id="the-toxicity-metric">The Toxicity Metric<a href="#the-toxicity-metric" class="hash-link" aria-label="Direct link to The Toxicity Metric" title="Direct link to The Toxicity Metric"></a></h3> <p>In addition to this custom metric let's use MLflow built-in metrics for the evaluators. In this case MLflow wll use roberta-hate-speech model to detect the <a href="https://huggingface.co/spaces/evaluate-measurement/toxicity" target="_blank" rel="noopener noreferrer">toxicity</a>. This metric evaluates responses for any harmful or inappropriate content, reinforcing the company's commitment to a positive customer experience.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token comment" style="color:rgb(106, 153, 85)"># Log and evaluate the model</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">with</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">start_run</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(86, 156, 214)">as</span><span class="token plain"> run</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> results </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">evaluate</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> basic_translation_model</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">model_uri</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> data</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain">eval_data</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> model_type</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"text"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> evaluators</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"default"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> extra_metrics</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token plain">cultural_sensitivity</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> evaluator_config</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"col_mapping"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"inputs"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"llm_inputs"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">end_run</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>You can retrieve the final results as such:</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token plain">results</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">tables</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token string" style="color:rgb(206, 145, 120)">"eval_results_table"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <table><thead><tr><th></th><th>llm_inputs</th><th>outputs</th><th>token_count</th><th>toxicity/v1/score</th><th>flesch_kincaid_grade_level/v1/score</th><th>ari_grade_level/v1/score</th><th>cultural_sensitivity/v1/score</th><th>cultural_sensitivity/v1/justification</th></tr></thead><tbody><tr><td>0</td><td>I'm over the moon about the news!</td><td>¡Estoy feliz por la noticia!</td><td>9</td><td>0.000258</td><td>5.2</td><td>3.7</td><td>4</td><td>The translation captures the general sentiment...</td></tr><tr><td>1</td><td>Spill the beans.</td><td>Revela el secreto.</td><td>7</td><td>0.001017</td><td>9.2</td><td>5.2</td><td>5</td><td>The translation accurately captures the idioma...</td></tr><tr><td>2</td><td>Bite the bullet.</td><td>Morder la bala.</td><td>7</td><td>0.001586</td><td>0.9</td><td>3.6</td><td>2</td><td>The translation "Morder la bala" is a litera...</td></tr><tr><td>3</td><td>Better late than never.</td><td>Más vale tarde que nunca.</td><td>7</td><td>0.004947</td><td>0.5</td><td>0.9</td><td>5</td><td>The translation accurately captures the idioma...</td></tr></tbody></table> <p>Let's analyze the final metrics...</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token plain">cultural_sensitivity_score </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> results</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token string" style="color:rgb(206, 145, 120)">'cultural_sensitivity/v1/mean'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">print</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token string-interpolation string" style="color:rgb(206, 145, 120)">f"Cultural Sensitivity Score: </span><span class="token string-interpolation interpolation punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string-interpolation interpolation">cultural_sensitivity_score</span><span class="token string-interpolation interpolation punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token string-interpolation string" style="color:rgb(206, 145, 120)">"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">toxicity_score </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> results</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token string" style="color:rgb(206, 145, 120)">'toxicity/v1/mean'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token comment" style="color:rgb(106, 153, 85)"># Calculate non-toxicity score</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">non_toxicity_score </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"{:.2f}"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token builtin" style="color:rgb(86, 156, 214)">format</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token number" style="color:rgb(181, 206, 168)">1</span><span class="token plain"> </span><span class="token operator" style="color:rgb(212, 212, 212)">-</span><span class="token plain"> toxicity_score</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"> </span><span class="token operator" style="color:rgb(212, 212, 212)">*</span><span class="token plain"> </span><span class="token number" style="color:rgb(181, 206, 168)">100</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">print</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token string-interpolation string" style="color:rgb(206, 145, 120)">f"Non-Toxicity Score: </span><span class="token string-interpolation interpolation punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string-interpolation interpolation">non_toxicity_score</span><span class="token string-interpolation interpolation punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token string-interpolation string" style="color:rgb(206, 145, 120)">%"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>Output:</p> <div class="language-bash codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-bash codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token plain">Cultural Sensitivity Score: 3.75</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">Pureness Score: 99.80</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>It is often the case we want to monitor and track these metrics on a dashboard so both data scientists and stakeholders have an understanding of the performance and reliability of these solutions.</p> <p>For this example let's create a gauge to display the final metric.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token keyword" style="color:rgb(86, 156, 214)">import</span><span class="token plain"> plotly</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">graph_objects </span><span class="token keyword" style="color:rgb(86, 156, 214)">as</span><span class="token plain"> go</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">from</span><span class="token plain"> plotly</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">subplots </span><span class="token keyword" style="color:rgb(86, 156, 214)">import</span><span class="token plain"> make_subplots</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(220, 220, 170)">create_gauge_chart</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">value1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> title1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> value2</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> title2</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token comment" style="color:rgb(106, 153, 85)"># Create a subplot figure with two columns</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> fig </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> make_subplots</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">rows</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> cols</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">2</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> specs</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">'type'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">'indicator'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">'type'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">'indicator'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token comment" style="color:rgb(106, 153, 85)"># Add the first gauge chart</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> fig</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">add_trace</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">go</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">Indicator</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mode </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"gauge+number"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> value </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> value1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> title </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">'text'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> title1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> gauge </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">'axis'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">'range'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token boolean">None</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token number" style="color:rgb(181, 206, 168)">5</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> row</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> col</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token comment" style="color:rgb(106, 153, 85)"># Add the second gauge chart</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> fig</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">add_trace</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">go</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">Indicator</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mode </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"gauge+number"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> value </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> value2</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> title </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">'text'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> title2</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> gauge </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">'axis'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">'range'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token boolean">None</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token number" style="color:rgb(181, 206, 168)">100</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> row</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">1</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> col</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">2</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token comment" style="color:rgb(106, 153, 85)"># Update layout</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> fig</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">update_layout</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">height</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">400</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> width</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">800</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token comment" style="color:rgb(106, 153, 85)"># Show figure</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> fig</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">show</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token plain">create_gauge_chart</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">cultural_sensitive_score</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"Cultural Sensitivity Score"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">float</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">non_toxicity_score</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"Non Toxicity Score"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p><img loading="lazy" alt="Gauge Chart" src="/assets/images/gauge-c21a7facbde6cfee6d102ac805057db2.png" width="800" height="400" class="img_ev3q"></p> <h3 class="anchor anchorWithStickyNavbar_LWe7" id="the-faithfulness-metric">The Faithfulness Metric<a href="#the-faithfulness-metric" class="hash-link" aria-label="Direct link to The Faithfulness Metric" title="Direct link to The Faithfulness Metric"></a></h3> <p>As Worldwide WanderAgency's AI grows, they add a customer service chatbot that handles questions in multiple languages. This chatbot uses a RAG (Retrieval-Augmented Generation) system, which means it retrieves information from a database or documents and then generates an answer based on that information.</p> <p>It's important that the answers provided by the chatbot stay true to the information it retrieves. To make sure of this, we create a "faithfulness" metric. This metric checks how well the chatbot's responses match the materials it’s supposed to be based on, ensuring the information given to customers is accurate.</p> <p>For example, If the retrieved document says "Returns are accepted within 30 days," and the chatbot replies with "Our return policy is flexible and varies by region," it is not aligning well with the retrieved material. This inaccurate response (bad faithfulness) could mislead customers and create confusion.</p> <h3 class="anchor anchorWithStickyNavbar_LWe7" id="using-mlflow-to-evaluate-rag---faithfulness">Using MLflow to Evaluate RAG - Faithfulness<a href="#using-mlflow-to-evaluate-rag---faithfulness" class="hash-link" aria-label="Direct link to Using MLflow to Evaluate RAG - Faithfulness" title="Direct link to Using MLflow to Evaluate RAG - Faithfulness"></a></h3> <p>Let's evaluate how well our chatbot is doing in sticking to the retrieved information. Instead of using an MLflow model this time, we’ll use a custom function to define the faithfulness metric and see how aligned the chatbot's answers are with the data it pulls from.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token comment" style="color:rgb(106, 153, 85)"># Prepare evaluation data</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">eval_data </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> pd</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">DataFrame</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"llm_inputs"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">"""Question: What is the company's policy on employee training?</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">context: "Our company offers various training programs to support employee development. Employees are required to complete at least one training course per year related to their role. Additional training opportunities are available based on performance reviews." """</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">"""Question: What is the company's policy on sick leave?</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">context: "Employees are entitled to 10 days of paid sick leave per year. Sick leave can be used for personal illness or to care for an immediate family member. A doctor's note is required for sick leave exceeding three consecutive days." """</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">"""Question: How does the company handle performance reviews?</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">context: "Performance reviews are conducted annually. Employees are evaluated based on their job performance, goal achievement, and overall contribution to the team. Feedback is provided, and development plans are created to support employee growth." """</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>Now let's define some examples for this faithfulness metric.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token plain">examples </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">"""Question: What is the company's policy on remote work?</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">context: "Our company supports a flexible working environment. Employees can work remotely up to three days a week, provided they maintain productivity and attend all mandatory meetings." """</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Employees can work remotely up to three days a week if they maintain productivity and attend mandatory meetings."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">5</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"The answer is accurate and directly related to the question and context provided."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">"""Question: What is the company's policy on remote work?</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">context: "Our company supports a flexible working environment. Employees can work remotely up to three days a week, provided they maintain productivity and attend all mandatory meetings." """</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Employees are allowed to work remotely as long as they want."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">2</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"The answer is somewhat related but incorrect because it does not mention the three-day limit."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">"""Question: What is the company's policy on remote work?</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">context: "Our company supports a flexible working environment. Employees can work remotely up to three days a week, provided they maintain productivity and attend all mandatory meetings." """</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Our company supports flexible work arrangements."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">3</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"The answer is related to the context but does not specifically answer the question about the remote work policy."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token builtin" style="color:rgb(86, 156, 214)">input</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">"""Question: What is the company's annual leave policy?</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token triple-quoted-string string" style="color:rgb(206, 145, 120)">context: "Employees are entitled to 20 days of paid annual leave per year. Leave must be approved by the employee's direct supervisor and should be planned in advance to ensure minimal disruption to work." """</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> output</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Employees are entitled to 20 days of paid annual leave per year, which must be approved by their supervisor."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> score</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token number" style="color:rgb(181, 206, 168)">5</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> justification</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"The answer is accurate and directly related to the question and context provided."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token comment" style="color:rgb(106, 153, 85)"># Define the custom metric</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">faithfulness </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">make_genai_metric</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> name</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"faithfulness"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> definition</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Assesses how well the answer relates to the question and provided context."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> grading_prompt</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"Score from 1-5, where 1 is not related at all and 5 is highly relevant and accurate."</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> examples</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain">examples</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>Define out LLM function (in this case it can be any function that follows certain input/output formats that <a href="https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate" target="_blank" rel="noopener noreferrer"><code>mlflow.evaluate()</code></a>).</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token comment" style="color:rgb(106, 153, 85)"># Using custom function</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">def</span><span class="token plain"> </span><span class="token function" style="color:rgb(220, 220, 170)">my_llm</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">inputs</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> answers </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> system_prompt </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"Please answer the following question in formal language based on the context provided."</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token keyword" style="color:rgb(86, 156, 214)">for</span><span class="token plain"> index</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> row </span><span class="token keyword" style="color:rgb(86, 156, 214)">in</span><span class="token plain"> inputs</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">iterrows</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token keyword" style="color:rgb(86, 156, 214)">print</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token string" style="color:rgb(206, 145, 120)">'INPUTS:'</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> row</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> completion </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> openai</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">chat</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">completions</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">create</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> model</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"gpt-3.5-turbo"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> messages</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">"role"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"system"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"content"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> system_prompt</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string" style="color:rgb(206, 145, 120)">"role"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"user"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"content"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string-interpolation string" style="color:rgb(206, 145, 120)">f"</span><span class="token string-interpolation interpolation punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token string-interpolation interpolation">row</span><span class="token string-interpolation interpolation punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token string-interpolation string" style="color:rgb(206, 145, 120)">"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> answers</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">append</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">completion</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">choices</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token number" style="color:rgb(181, 206, 168)">0</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">message</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">content</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token keyword" style="color:rgb(86, 156, 214)">return</span><span class="token plain"> answers</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>Resulting in a code that is similar to what we did before...</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token keyword" style="color:rgb(86, 156, 214)">with</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">start_run</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(86, 156, 214)">as</span><span class="token plain"> run</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> results </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">evaluate</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> my_llm</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> eval_data</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> model_type</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"text"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> evaluators</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"default"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> extra_metrics</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token plain">faithfulness</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> evaluator_config</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"col_mapping"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"inputs"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"llm_inputs"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">end_run</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <h3 class="anchor anchorWithStickyNavbar_LWe7" id="genai-metrics">GenAI Metrics<a href="#genai-metrics" class="hash-link" aria-label="Direct link to GenAI Metrics" title="Direct link to GenAI Metrics"></a></h3> <p>Alternatively, we can leverage MLflow's built-in metrics for generative AI, using the same examples.</p> <p>MLflow provides several <a href="https://mlflow.org/docs/latest/python_api/mlflow.metrics.html?highlight=genai%20answer#generative-ai-metrics" target="_blank" rel="noopener noreferrer">built-in metrics</a> that use an LLM as a judge. Despite differences in implementation, these metrics are used in the same way. Simply include them in the <code>extra_metrics</code> argument of the <a href="https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate" target="_blank" rel="noopener noreferrer"><code>mlflow.evaluate()</code></a> function.</p> <p>In this case, we will use MLflow’s built-in <a href="https://mlflow.org/docs/latest/python_api/mlflow.metrics.html?highlight=genai%20answer#mlflow.metrics.genai.faithfulness" target="_blank" rel="noopener noreferrer">faithfulness metric</a>.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token keyword" style="color:rgb(86, 156, 214)">from</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">metrics</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">genai </span><span class="token keyword" style="color:rgb(86, 156, 214)">import</span><span class="token plain"> EvaluationExample</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"> faithfulness</span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">faithfulness_metric </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> faithfulness</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">model</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"openai:/gpt-4"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"></span><span class="token keyword" style="color:rgb(86, 156, 214)">print</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain">faithfulness_metric</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p><a href="https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate" target="_blank" rel="noopener noreferrer"><code>mlflow.evaluate()</code></a> simplifies the process of providing grading context, such as the documents retrieved by our system, directly into the evaluation. This feature integrates seamlessly with <a href="https://python.langchain.com/docs/concepts/#retrievers" target="_blank" rel="noopener noreferrer">LangChain's retrievers</a>, allowing you to supply the context for evaluation as a dedicated column. For more details, refer to <a href="https://mlflow.org/docs/latest/llms/llm-evaluate/notebooks/rag-evaluation-llama2.html" target="_blank" rel="noopener noreferrer">this example</a>.</p> <p>In this case, since our retrieved documents are already included within the final prompt and we are not leveraging LangChain for this tutorial, we will simply map the <code>llm_input</code> column as our grading context.</p> <div class="language-python codeBlockContainer_Ckt0 theme-code-block" style="--prism-color:#9CDCFE;--prism-background-color:#1E1E1E"><div class="codeBlockContent_biex"><pre tabindex="0" class="prism-code language-python codeBlock_bY9V thin-scrollbar" style="color:#9CDCFE;background-color:#1E1E1E"><code class="codeBlockLines_e6Vv"><span class="token-line" style="color:#9CDCFE"><span class="token keyword" style="color:rgb(86, 156, 214)">with</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">start_run</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"> </span><span class="token keyword" style="color:rgb(86, 156, 214)">as</span><span class="token plain"> run</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> results </span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token plain"> mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">evaluate</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> my_llm</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> eval_data</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> model_type</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"text"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> evaluators</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token string" style="color:rgb(206, 145, 120)">"default"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> extra_metrics</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">[</span><span class="token plain">faithfulness_metric</span><span class="token punctuation" style="color:rgb(212, 212, 212)">]</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> evaluator_config</span><span class="token operator" style="color:rgb(212, 212, 212)">=</span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"col_mapping"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">{</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"inputs"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"llm_inputs"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"context"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">:</span><span class="token plain"> </span><span class="token string" style="color:rgb(206, 145, 120)">"llm_inputs"</span><span class="token punctuation" style="color:rgb(212, 212, 212)">,</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token punctuation" style="color:rgb(212, 212, 212)">}</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain"> </span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><span class="token plain"></span><br></span><span class="token-line" style="color:#9CDCFE"><span class="token plain">mlflow</span><span class="token punctuation" style="color:rgb(212, 212, 212)">.</span><span class="token plain">end_run</span><span class="token punctuation" style="color:rgb(212, 212, 212)">(</span><span class="token punctuation" style="color:rgb(212, 212, 212)">)</span><br></span></code></pre><div class="buttonGroup__atx"><button type="button" aria-label="Copy code to clipboard" title="Copy" class="clean-btn"><span class="copyButtonIcons_eSgA" aria-hidden="true"><svg viewBox="0 0 24 24" class="copyButtonIcon_y97N"><path fill="currentColor" d="M19,21H8V7H19M19,5H8A2,2 0 0,0 6,7V21A2,2 0 0,0 8,23H19A2,2 0 0,0 21,21V7A2,2 0 0,0 19,5M16,1H4A2,2 0 0,0 2,3V17H4V3H16V1Z"></path></svg><svg viewBox="0 0 24 24" class="copyButtonSuccessIcon_LjdS"><path fill="currentColor" d="M21,7L9,19L3.5,13.5L4.91,12.09L9,16.17L19.59,5.59L21,7Z"></path></svg></span></button></div></div></div> <p>After the evaluation we get the following results: <img loading="lazy" alt="Gauge faithfulness Chart" src="/assets/images/faithfulness-246906b0fa032bfe3bed8cdee648d7bf.png" width="800" height="400" class="img_ev3q"></p> <h2 class="anchor anchorWithStickyNavbar_LWe7" id="conclusion">Conclusion<a href="#conclusion" class="hash-link" aria-label="Direct link to Conclusion" title="Direct link to Conclusion"></a></h2> <p>By combining the Cultural Sensitivity score with our other calculated metrics, our travel agency can further refine its model to ensure the delivery of high-quality content across all languages. Moving forward, we can revisit and adjust the prompts used to boost our Cultural Sensitivity score. Alternatively, we could fine-tune a smaller model to maintain the same high level of cultural sensitivity while reducing costs. These steps will help us provide even better service to the agency's diverse customer base.</p> <p><a href="https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.evaluate" target="_blank" rel="noopener noreferrer"><code>mlflow.evaluate()</code></a>, combined with LLMs as judges, opens up new possibilities for nuanced and context-aware model evaluation. By creating custom metrics tailored to specific aspects of model performance, data scientists can gain deeper insights into their models' strengths and weaknesses.</p> <p>The flexibility offered by <code>make_genai_metric()</code> allows you to create evaluation criteria that are perfectly suited to your specific use case. Whether you need structured guidance for your LLM judge or want full control over the prompting process, MLflow provides the tools you need.</p> <p>As you explore MLflow evaluate and LLM-based metrics, remember that the key lies in designing thoughtful evaluation criteria and providing clear instructions to your LLM judge. With these tools at your disposal, you're well-equipped to take your model evaluation to the next level, ensuring that your language models not only perform well on traditional metrics but also meet the nuanced requirements of real-world applications.</p> <p>The built-in metrics, such as toxicity, offer standardized assessments that are crucial for ensuring the safety and accessibility of model outputs.</p> <p>As a final challenge, re-run all the tests performed but this time with "gpt-4o-mini" and see how the performance is affected.</p></div><footer class="row docusaurus-mt-lg blogPostFooterDetailsFull_mRVl"><div class="col"><b>Tags:</b><ul class="tags_jXut padding--none margin-left--sm"><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/genai">genai</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/mlflow-evalaute">mlflow-evalaute</a></li></ul></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Blog post page navigation"><a class="pagination-nav__link pagination-nav__link--prev" href="/blog/mlflow-llama-index-workflow"><div class="pagination-nav__sublabel">Newer Post</div><div class="pagination-nav__label">Building Advanced RAG with MLflow and LlamaIndex Workflow</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/blog/models_from_code"><div class="pagination-nav__sublabel">Older Post</div><div class="pagination-nav__label">Models from Code Logging in MLflow - What, Why, and How</div></a></nav></main><div class="col col--2"><div class="tableOfContents_bqdL thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#the-challenge-of-evaluating-language-models" class="table-of-contents__link toc-highlight">The Challenge of Evaluating Language Models</a></li><li><a href="#introducing-mlflow-llm-evaluate" class="table-of-contents__link toc-highlight">Introducing MLflow LLM Evaluate</a></li><li><a href="#conquering-new-markets-with-an-llm-as-a-judge" class="table-of-contents__link toc-highlight">Conquering new markets with an LLM as a judge</a></li><li><a href="#custom-metrics-tailoring-evaluation-to-your-needs" class="table-of-contents__link toc-highlight">Custom Metrics: Tailoring Evaluation to Your Needs</a></li><li><a href="#evaluating-worldwide-wanderagencys-ai-systems" class="table-of-contents__link toc-highlight">Evaluating Worldwide WanderAgency's AI Systems</a><ul><li><a href="#cultural-sensitivity-metric" class="table-of-contents__link toc-highlight">Cultural Sensitivity Metric</a></li><li><a href="#the-toxicity-metric" class="table-of-contents__link toc-highlight">The Toxicity Metric</a></li><li><a href="#the-faithfulness-metric" class="table-of-contents__link toc-highlight">The Faithfulness Metric</a></li><li><a href="#using-mlflow-to-evaluate-rag---faithfulness" class="table-of-contents__link toc-highlight">Using MLflow to Evaluate RAG - Faithfulness</a></li><li><a href="#genai-metrics" class="table-of-contents__link toc-highlight">GenAI Metrics</a></li></ul></li><li><a href="#conclusion" class="table-of-contents__link toc-highlight">Conclusion</a></li></ul></div></div></div></div></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><a class="footerLogoLink_DDai" href="/"><img src="/img/mlflow-white.svg" alt="MLflow" class="footer__logo themedComponent_mlkZ themedComponent--light_NVdE" width="200px"><img src="/img/mlflow-black.svg" alt="MLflow" class="footer__logo themedComponent_mlkZ themedComponent--dark_xIcU" width="200px"></a></div><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://stackoverflow.com/questions/tagged/mlflow" target="_blank" rel="noopener noreferrer" class="footer__link-item">Stack Overflow<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://www.linkedin.com/company/mlflow-org" target="_blank" rel="noopener noreferrer" class="footer__link-item">LinkedIn<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://x.com/mlflow" target="_blank" rel="noopener noreferrer" class="footer__link-item">X<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><div class="col footer__col"><div class="footer__title">Resources</div><ul class="footer__items clean-list"><li class="footer__item"><a href="/docs/latest/index.html" target="_blank" rel="noopener noreferrer" class="footer__link-item">Docs</a></li><li class="footer__item"><a class="footer__link-item" href="/releases">Releases</a></li><li class="footer__item"><a class="footer__link-item" href="/blog">Blog</a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">© 2025 MLflow Project, a Series of LF Projects, LLC.</div></div></div></footer></div> </body> </html>