CINXE.COM

RAGの検索性能を90%も低下させるテキストの落とし穴

<!DOCTYPE html><html lang="ja"><head><meta charSet="utf-8"/><meta content="width=device-width, initial-scale=1" name="viewport"/><title>RAGの検索性能を90%も低下させるテキストの落とし穴</title><link rel="canonical" href="https://zenn.dev/knowledgesense/articles/ff2c528acf6b04"/><meta name="twitter:card" content="summary_large_image"/><meta property="og:url" content="https://zenn.dev/knowledgesense/articles/ff2c528acf6b04"/><meta property="og:title" content="RAGの検索性能を90%も低下させるテキストの落とし穴"/><meta property="og:image" content="https://res.cloudinary.com/zenn/image/upload/s--1-ytxo17--/c_fit%2Cg_north_west%2Cl_text:notosansjp-medium.otf_55:RAG%25E3%2581%25AE%25E6%25A4%259C%25E7%25B4%25A2%25E6%2580%25A7%25E8%2583%25BD%25E3%2582%259290%25EF%25BC%2585%25E3%2582%2582%25E4%25BD%258E%25E4%25B8%258B%25E3%2581%2595%25E3%2581%259B%25E3%2582%258B%25E3%2583%2586%25E3%2582%25AD%25E3%2582%25B9%25E3%2583%2588%25E3%2581%25AE%25E8%2590%25BD%25E3%2581%25A8%25E3%2581%2597%25E7%25A9%25B4%2Cw_1010%2Cx_90%2Cy_100/g_south_west%2Cl_text:notosansjp-medium.otf_34:sasakuna%2Cx_220%2Cy_108/bo_3px_solid_rgb:d6e3ed%2Cg_south_west%2Ch_90%2Cl_fetch:aHR0cHM6Ly96ZW5uLmRldi9pbWFnZXMvZGVmYXVsdC1wdWJsaWNhdGlvbi1hdmF0YXIucG5n%2Cr_20%2Cw_90%2Cx_92%2Cy_102/co_rgb:6e7b85%2Cg_south_west%2Cl_text:notosansjp-medium.otf_30:%25E3%2583%258A%25E3%2583%25AC%25E3%2583%2583%25E3%2582%25B8%25E3%2582%25BB%25E3%2583%25B3%25E3%2582%25B9%2520-%2520AI%25E7%259F%25A5%25E8%25A6%258B%25E5%2585%25B1%25E6%259C%2589%25E3%2583%2596%25E3%2583%25AD%25E3%2582%25B0%2Cx_220%2Cy_160/bo_4px_solid_white%2Cg_south_west%2Ch_50%2Cl_fetch:aHR0cHM6Ly9zdG9yYWdlLmdvb2dsZWFwaXMuY29tL3plbm4tdXNlci11cGxvYWQvYXZhdGFyL2ZkMzU2MDA4YWUuanBlZw==%2Cr_max%2Cw_50%2Cx_139%2Cy_84/v1627283836/default/og-base-w1200-v2.png"/><meta property="og:type" content="article"/><meta property="og:site_name" content="Zenn"/><meta content="https://storage.googleapis.com/zenn-user-upload/avatar/fd356008ae.jpeg" name="zenn:image"/><meta content="sasakunaさんによる記事" name="zenn:description"/><meta 
name="next-head-count" content="12"/><script nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=">var theme = localStorage.getItem('theme') || 'light'; window.document.documentElement.dataset.theme = theme;</script><script nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=">var newDataLayer = {}; // display_modeを追加 if (window.matchMedia('(display-mode: standalone)').matches) { newDataLayer.display_mode = 'standalone'; // PWA } else if (window.navigator.standalone === true) { newDataLayer.display_mode = 'standalone'; // PWA on iOS Safari } else { newDataLayer.display_mode = 'browser'; // Web } // user_idを追加 // ページ初期化時に取得したいため、ローカルストレージからcachedUser.idを取得する var zennCurrentUserValue = localStorage.getItem('zenn_current_user'); if (zennCurrentUserValue) { try { var currentUser = JSON.parse(zennCurrentUserValue); if (currentUser.cachedUser) { newDataLayer.user_id = currentUser.cachedUser.id; } } catch {} } window.dataLayer = window.dataLayer || []; window.dataLayer.push(newDataLayer);</script><script nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=">(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-K42DRM8');</script><script async="" src="https://www.googletagmanager.com/gtag/js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo="></script><script nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=">window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag('js', new Date());</script><script src="https://embed.zenn.studio/js/listen-embed-event.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo="></script><style nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo="> @font-face { font-family: 'Inter'; font-weight: 600; font-display: swap; src: local(''), 
url('https://static.zenn.studio/fonts/inter-v3-latin-600.woff2') format('woff2'); } @font-face { font-family: 'Inter'; font-weight: 700; font-display: swap; src: local(''), url('https://static.zenn.studio/fonts/inter-v3-latin-700.woff2') format('woff2'); }</style><meta content="Zenn" name="apple-mobile-web-app-title"/><link href="/manifest.json" rel="manifest"/><link href="https://static.zenn.studio/images/logo-transparent.png" rel="shortcut icon" type="image/png"/><link href="https://static.zenn.studio/images/icon.png" rel="apple-touch-icon-precomposed" type="image/png"/><link nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" rel="preload" href="https://static.zenn.studio/_next/static/css/a6c53c418ae5538e.css" as="style"/><link nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" rel="stylesheet" href="https://static.zenn.studio/_next/static/css/a6c53c418ae5538e.css" data-n-g=""/><link nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" rel="preload" href="https://static.zenn.studio/_next/static/css/d118ea5137a96514.css" as="style"/><link nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" rel="stylesheet" href="https://static.zenn.studio/_next/static/css/d118ea5137a96514.css" data-n-p=""/><noscript data-n-css="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo="></noscript><script defer="" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" nomodule="" src="https://static.zenn.studio/_next/static/chunks/polyfills-42372ed130431b0a.js"></script><script src="https://static.zenn.studio/_next/static/chunks/webpack-473483b77a5d4755.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/chunks/framework-6603b6fce1ea64cf.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/chunks/main-2fb7d839e5bb90ea.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script 
src="https://static.zenn.studio/_next/static/chunks/pages/_app-3f11decb4f1a7920.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/chunks/5625-25fd02b61ddb8be0.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/chunks/327-4a46c0c9d0f704da.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/chunks/755-bff961df2e6f1d32.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/chunks/2673-214c47a84d31dc38.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/chunks/3008-d4936fa749de4c75.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/chunks/4779-fd0d2d1f9ae5b285.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/chunks/pages/%5Busername%5D/articles/%5Bslug%5D-7b7a5f841c87f94b.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/K0KfDsWWmRmIshdNoQaRg/_buildManifest.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script><script src="https://static.zenn.studio/_next/static/K0KfDsWWmRmIshdNoQaRg/_ssgManifest.js" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=" defer=""></script></head><body><script nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=">const shouldUseTwemoji = !/(googlebot|macintosh|macintel|macppc|mac68k|macos|iphone|ipad)/i.test(window.navigator.userAgent); if(shouldUseTwemoji) document.body.setAttribute("data-use-twemoji", "true");</script><div id="__next"><header class="AppHeader_header__54XdE"><div class="Container_wide__ykGLh 
Container_common__figYY"><div class="AppHeader_inner__CJC4C"><a class="ZennLogo_logoLink__NJS2l" href="/"><svg x="0px" y="0px" viewBox="0 0 377.4 88.3" aria-label="Zenn | エンジニアのための情報共有コミュニティ" class="ZennLogo_logoSvg__yOhcg" height="22"><title>Zenn</title><g fill="currentColor"><path d="M233,56.8h-39c0.5,3.5,2.2,6.8,4.8,9.2c2.7,2.3,6.2,3.5,9.8,3.4c2.8,0,5.6-0.5,8.2-1.7c2.5-1.1,4.8-2.8,6.5-5l8.2,9.5 c-2.5,3.4-5.7,6.1-9.5,7.9c-4.6,2.2-9.6,3.3-14.7,3.2c-5.7,0.1-11.4-1.2-16.5-4c-4.5-2.5-8.2-6.3-10.7-10.9s-3.8-9.8-3.7-15.1v-2.2 c-0.1-5.7,1.1-11.3,3.5-16.5c2.2-4.7,5.7-8.6,10.1-11.3c4.7-2.8,10.1-4.2,15.5-4.1c5.2-0.1,10.3,1.1,14.9,3.7 c4.1,2.5,7.4,6.2,9.4,10.5c2.2,5.1,3.3,10.5,3.2,16.1V56.8z M216.1,43.9c0.1-2.9-0.9-5.7-2.8-7.9c-1.8-1.9-4.4-2.9-7.9-2.9 c-2.9-0.1-5.8,1.1-7.7,3.2c-2,2.6-3.3,5.7-3.6,9h22V43.9z"></path><path d="M128.3,67.9h36.1v14.7h-56.9V72l35.8-54.3h-36.2V2.9h56.6v10.4L128.3,67.9z"></path><path d="M248.8,50.7c0-19.1,12.7-29.2,28.2-29.2s27.9,10.1,27.9,29.2V82h-16V51.4c0-10.6-4.8-16.1-12-16.1s-12.4,5.5-12.4,16.1 v30.7h-15.8L248.8,50.7L248.8,50.7z"></path><path d="M320.3,50.7c0-19.1,12.7-29.2,28.2-29.2s27.9,10.1,27.9,29.2V82h-16V51.4c0-10.6-4.8-16.1-12-16.1S336,40.8,336,51.4v30.7 h-15.8L320.3,50.7L320.3,50.7z"></path></g><path fill="#3EA8FF" class="st0" d="M2.4,83.3h17c0.9,0,1.7-0.5,2.2-1.2L68.4,5.2C69,4.2,68.3,3,67.1,3H51c-0.8,0-1.5,0.4-1.9,1.1L1.6,81.9 C1.3,82.5,1.7,83.3,2.4,83.3z"></path><path fill="#3EA8FF" class="st0" d="M61,82.1l22.1-35.5c0.7-1.1-0.1-2.5-1.4-2.5H65.7c-0.6,0-1.2,0.3-1.5,0.8L41.5,81.2c-0.6,0.9,0.1,2.1,1.2,2.1 h16.3C59.8,83.3,60.6,82.9,61,82.1z"></path></svg></a></div></div></header><article class="View_container__VQuzA"><aside class="ContentStickyNavForMobile_container__153a8"><div class="Container_wide__ykGLh Container_common__figYY"><div class="ContentStickyNavForMobile_inner__xJ_fS"><div class="ContentStickyNavForMobile_avatarInner__3_noS"><a class="ContentStickyNavForMobile_principalLink__c15vw" href="/p/knowledgesense"><span 
class="ContentStickyNavForMobile_avatarContainer__V3P_G"><img alt="ナレッジセンス - AI知見共有ブログ" class="AvatarImage_border__pDIjF AvatarImage_plain__Fgp4R AvatarImage_withPublication__hNtIe" height="38" referrerPolicy="no-referrer" src="https://zenn.dev/images/default-publication-avatar.png" width="38"/></span><span class="ContentStickyNavForMobile_displayName__cmEag">ナレッジセンス - AI知見共有ブログ</span></a></div><div class="ContentStickyNavForMobile_actions__a9fMk"></div></div></div></aside><div class="PublicationInfoForDesktop_container__UxKig"><div class="Container_wide__ykGLh Container_common__figYY"><div class="PublicationInfoForDesktop_inner__LnToC"><a class="PublicationInfoForDesktop_link__fqxgJ" href="/p/knowledgesense"><span class="PublicationInfoForDesktop_avatar__qh_yu"><img alt="" class="AvatarImage_border__pDIjF AvatarImage_plain__Fgp4R AvatarImage_withPublication__hNtIe" height="45" referrerPolicy="no-referrer" src="https://zenn.dev/images/default-publication-avatar.png" width="45"/></span><span class="PublicationInfoForDesktop_displayName__nzipD">ナレッジセンス - AI知見共有ブログ</span></a><span class="PublicationInfoForDesktop_label__d4mY3"><a class="PublicationLabelLink_link__JBqs2" href="/faq#what-is-publication">Publicationへの投稿</a></span></div></div></div><header class="ArticleHeader_header__IRbtk"><div class="Container_wide__ykGLh Container_common__figYY"><div class="ArticleHeader_main__W98WY"><div class="ArticleHeader_emoji__30JiU"><span class="Emoji_twemoji__hcxYF"><span class="Emoji_twemojiImg__Oc6vR" style="background-image:url(https://asia-northeast1-zenn-dev-production.cloudfunctions.net/twemoji/📉.svg)"></span></span><span class="Emoji_nativeEmoji__GMBzX">📉</span></div><h1 class="ArticleHeader_title__9jiOv"><span style="font-size:0.892em">RAGの検索性能を90%も低下させるテキストの落とし穴</span></h1><div class="ArticleHeader_metaContainer__5UzrJ"><div class="ArticleHeader_metaInfo__XrRdh"><div class="ArticleHeader_userInfo__g_sSW"><a class="ArticleHeader_avatar__anCEE" href="/sasakuna"><img 
alt="" class="AvatarImage_plain__Fgp4R " height="25" loading="lazy" referrerPolicy="no-referrer" src="https://storage.googleapis.com/zenn-user-upload/avatar/fd356008ae.jpeg" width="25"/></a><a class="ArticleHeader_metaUserName__FbZgW" href="/sasakuna">sasakuna</a></div><span class="ArticleHeader_pubDate__gF_sc"><span class="ArticleHeader_num__7Zpz0">2025/03/18</span>に公開</span></div></div></div></div></header><div class="Container_wide__ykGLh Container_common__figYY"><div class="ContainerUndo_undoInSM__1vdc1"><div class="View_inner__LlCJG"><div class="View_stickyShare__TsaVf"><div class="View_stickyShareInner__FLu2S"><div class="LikeButton_container__YlckE style-large-white count-bottom"><button aria-label="いいね" class="LikeButton_button__ZwdG4" data-pressed="false"><svg class="LikeButton_svgLike__Gl0Sz" viewBox="0 0 110 110"><path class="LikeButton_svgLikeLine__f9txR" d="M73,24a23.78,23.78,0,0,0-15.89,6.19,3.14,3.14,0,0,1-4.18,0A23.81,23.81,0,0,0,37,24a22,22,0,0,0-22,22c0,16.67,19.64,32.82,25.11,37.93,2.84,2.65,6.15,5.64,8.92,8.13a8.9,8.9,0,0,0,11.9,0c2.77-2.49,6.07-5.48,8.91-8.13C75.37,78.81,95,62.66,95,46A22,22,0,0,0,73,24Z" fill="currentColor"></path><path class="LikeButton_svgLikeInner__uiexS" d="M66.25,76.42c-.71.64-1.32,1.2-1.82,1.67-2.51,2.33-5.39,5-7.94,7.25a2.21,2.21,0,0,1-3,0C51,83,48.1,80.42,45.59,78.09c-.5-.47-1.12-1-1.82-1.67C38.09,71.29,23,57.67,23,46A14,14,0,0,1,37,32a15.92,15.92,0,0,1,11.65,5.23l4.73,5a2.2,2.2,0,0,0,3.23,0l4.72-5A16.06,16.06,0,0,1,73,32,14,14,0,0,1,87,46C87,57.67,71.93,71.29,66.25,76.42Z" fill="currentColor"></path><g class="LikeButton_svgLikeDecoration__78UjB"><circle cx="41.5" cy="9.5" fill="#3ea8ff" r="3.5"></circle><circle cx="98.5" cy="26.5" fill="#ffdc6e" r="3.5"></circle><circle cx="13" cy="19" fill="#c067f4" r="5"></circle><circle cx="77" cy="9" fill="#f76685" r="5"></circle><circle cx="26.5" cy="92.5" fill="#f76685" r="3.5"></circle><circle cx="105.5" cy="48.5" fill="#c067f4" r="3.5"></circle><circle cx="4.5" cy="60.5" 
fill="#3ea8ff" r="3.5"></circle><circle cx="94.5" cy="73.5" fill="#3ea8ff" r="1.5"></circle><circle cx="16.5" cy="75.5" fill="#ffdc6e" r="1.5"></circle><circle cx="78.5" cy="91.5" fill="#ffdc6e" r="1.5"></circle></g></svg></button></div><span style="display:block;height:1rem;flex-shrink:0"></span><span style="display:block;height:1rem;flex-shrink:0"></span><div class="ShareButtonsExperimental_container__CrtBj" style="flex-direction:column;align-items:stretch"><a aria-label="X(Twitter)にポスト" class="ShareButtonsExperimental_button__d9aXF ShareButtonsExperimental_svgTwitterVertical__NmmfB ShareButtonsExperimental_svgTwitterBase__8FrNq" data-tooltip-for-desktop="true" data-tooltip-position="bottom" href="https://twitter.com/intent/tweet?url=https://zenn.dev/knowledgesense/articles/ff2c528acf6b04&amp;text=RAG%E3%81%AE%E6%A4%9C%E7%B4%A2%E6%80%A7%E8%83%BD%E3%82%9290%EF%BC%85%E3%82%82%E4%BD%8E%E4%B8%8B%E3%81%95%E3%81%9B%E3%82%8B%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%81%AE%E8%90%BD%E3%81%A8%E3%81%97%E7%A9%B4%EF%BD%9Csasakuna&amp;hashtags=zenn" id="gtm-article-left-tweet" rel="nofollow noopener noreferrer" role="tooltip" target="_blank"><svg width="27" height="28" viewBox="0 0 27 28" fill="none"><g clip-path="url(#clip0_1_18)"><path d="M16.0687 11.7356L26.12 0H23.7382L15.0106 10.1899L8.03988 0H0L10.5411 15.4089L0 27.7155H2.38199L11.5985 16.9546L18.9601 27.7155H27L16.0681 11.7356H16.0687ZM12.8062 15.5447L11.7382 14.0103L3.24025 1.80106H6.89884L13.7568 11.6543L14.8248 13.1887L23.7393 25.9963H20.0807L12.8062 15.5452V15.5447Z" fill="currentColor"></path></g><defs><clipPath id="clip0_1_18"><rect width="27" height="27.7297" fill="white"></rect></clipPath></defs></svg></a><a aria-label="Facebookに投稿" class="ShareButtonsExperimental_button__d9aXF ShareButtonsExperimental_svgFacebookVertical__3ykb7 ShareButtonsExperimental_svgFacebookBase___gDOW" data-tooltip-for-desktop="true" data-tooltip-position="bottom" 
href="http://www.facebook.com/sharer.php?u=https://zenn.dev/knowledgesense/articles/ff2c528acf6b04" id="gtm-article-left-facebook" rel="nofollow noopener noreferrer" role="tooltip" target="_blank"><svg width="28" height="29" viewBox="0 0 28 29" fill="none"><g clip-path="url(#clip0_2_21)"><path d="M28 14.7812C28 7.02084 21.7319 0.729736 14 0.729736C6.26808 0.729736 0 7.02084 0 14.7812C0 21.3707 4.52032 26.9002 10.6182 28.4189V19.0753H7.73136V14.7812H10.6182V12.9309C10.6182 8.14833 12.7747 5.93158 17.453 5.93158C18.34 5.93158 19.8705 6.10637 20.4966 6.28061V10.1729C20.1662 10.138 19.5922 10.1206 18.8793 10.1206C16.5838 10.1206 15.6968 10.9935 15.6968 13.2625V14.7812H20.2698L19.4841 19.0753H15.6968V28.7297C22.629 27.8895 28.0006 21.9654 28.0006 14.7812H28Z" fill="currentColor"></path></g><defs><clipPath id="clip0_2_21"><rect width="28" height="28" fill="white" transform="translate(0 0.729736)"></rect></clipPath></defs></svg></a><a aria-label="はてなブックマークに登録" class="ShareButtonsExperimental_button__d9aXF ShareButtonsExperimental_svgHatenaBookmarkVertical__b161J ShareButtonsExperimental_svgHatenaBookmarkBase__VXz6p" data-tooltip-for-desktop="true" data-tooltip-position="bottom" href="https://b.hatena.ne.jp/add?mode=confirm&amp;url=https://zenn.dev/knowledgesense/articles/ff2c528acf6b04&amp;title=RAG%E3%81%AE%E6%A4%9C%E7%B4%A2%E6%80%A7%E8%83%BD%E3%82%9290%EF%BC%85%E3%82%82%E4%BD%8E%E4%B8%8B%E3%81%95%E3%81%9B%E3%82%8B%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%81%AE%E8%90%BD%E3%81%A8%E3%81%97%E7%A9%B4%EF%BD%9Csasakuna" id="gtm-article-left-hatena-bookmark" rel="nofollow noopener noreferrer" role="tooltip" target="_blank"><svg width="27" height="28" viewBox="0 0 27 28" fill="none"><path fill-rule="evenodd" clip-rule="evenodd" d="M5.4999 0.729736H21.5001C24.5376 0.729736 27 3.19213 27 6.22964V22.2298C27 25.2673 24.5376 27.7297 21.5001 27.7297H5.4999C2.46239 27.7297 0 25.2673 0 22.2298V6.22964C0 3.19213 2.46239 0.729736 5.4999 0.729736ZM12.98 13.7472C13.8521 13.8136 14.5319 
14.1209 15.0212 14.6673V14.6679C15.512 15.2127 15.7572 15.9444 15.7572 16.8543C15.7572 17.5126 15.62 18.0877 15.3436 18.5867C15.0692 19.084 14.6756 19.4712 14.1604 19.7433C13.7441 19.9658 13.2289 20.1278 12.6155 20.2245C12.001 20.3179 10.9966 20.3665 9.6039 20.3665H6.13656V8.09392H9.50184C10.8859 8.09392 11.8503 8.13875 12.4016 8.22623C12.9503 8.31695 13.4206 8.4703 13.8164 8.68792C14.269 8.93902 14.6129 9.27437 14.8527 9.69341C15.0865 10.1151 15.2069 10.6028 15.2069 11.1546C15.2069 11.8512 15.0309 12.4053 14.6761 12.813C14.3197 13.2261 13.756 13.5355 12.98 13.7472ZM9.96192 10.8144H9.24372H9.24318V13.2768H9.91116C10.7352 13.2768 11.2984 13.1856 11.6073 13.0058C11.9124 12.8227 12.0663 12.5273 12.0663 12.0667C12.0663 11.6061 11.9216 11.2821 11.6375 11.0947C11.3497 10.9084 10.7919 10.8144 9.96192 10.8144ZM10.4128 18.1514C11.2028 18.1514 11.7661 18.0531 12.0928 17.8523V17.8528C12.4227 17.653 12.5863 17.3209 12.5863 16.8581C12.5863 16.3408 12.4367 15.9806 12.1338 15.7765C11.8357 15.5724 11.2747 15.4703 10.463 15.4703H9.24372V18.1514H10.4128ZM19.3093 17.257C18.4502 17.257 17.7547 17.9525 17.7547 18.8111C17.7547 19.6697 18.4507 20.3658 19.3093 20.3658C20.1679 20.3658 20.8634 19.6697 20.8634 18.8111C20.8634 17.9525 20.1668 17.257 19.3093 17.257ZM17.9593 8.09318H20.6593V16.2753H17.9593V8.09318Z" fill="currentColor"></path></svg></a></div></div></div><div class="View_columnsContainer__ijfFN"><section class="View_content__OZ_Dc"><div class="View_main__AU6KW"><div class="Container_default__13H8g Container_common__figYY"><div class="View_topics__2sHkl"><a class="View_topicLink__jdtX_" href="/topics/embedding"><div class="View_topicImage__qMmmw"><img class="View_topicImg__TpkV5" loading="lazy" src="https://zenn.dev/images/topic.png"/></div><div class="View_topicName____nYp">embedding</div></a><a class="View_topicLink__jdtX_" href="/topics/rag"><div class="View_topicImage__qMmmw"><img class="View_topicImg__TpkV5" loading="lazy" src="https://zenn.dev/images/topic.png"/></div><div 
class="View_topicName____nYp">RAG</div></a><a class="View_topicLink__jdtX_" href="/tech-or-idea"><div class="View_topicImage__qMmmw"><img class="View_topicImg__TpkV5" loading="lazy" src="https://static.zenn.studio/images/drawing/tech-icon.svg"/></div><div class="View_topicName____nYp" style="text-transform:capitalize">tech</div></a></div><div class="InsertButtonToCodeBlock_insertButtonWrapper__ueql2"><div class="znc BodyContent_anchorToHeadings__uGxNv"><h2 id="%E5%B0%8E%E5%85%A5" data-line="0" class="code-line"> <a class="header-anchor-link" href="#%E5%B0%8E%E5%85%A5" aria-hidden="true"></a> 導入</h2> <p data-line="1" class="code-line">こんにちは、株式会社ナレッジセンスの須藤英寿です。</p> <p data-line="3" class="code-line">今回は、RAGの要であるEmbeddingの性能を大きく低下させてしまう、文章の特性について解説します。<br> このブログで紹介している内容は以下の論文を元に作成しておりますので、詳細はそちらをご確認ください。RAGを構成してみたが、どうしても正解の文章を取ってこれない!そんなときはもしかするとこの論文で紹介されているような文章になってしまっているかもしれません。</p> <p data-line="6" class="code-line"><span class="embed-block zenn-embedded zenn-embedded-card"><iframe id="zenn-embedded__8c27c04700ebe" src="https://embed.zenn.studio/card#zenn-embedded__8c27c04700ebe" data-content="https%3A%2F%2Farxiv.org%2Fpdf%2F2503.05037" frameborder="0" scrolling="no" loading="lazy"></iframe></span><a href="https://arxiv.org/pdf/2503.05037" style="display:none" target="_blank" rel="nofollow noopener noreferrer">https://arxiv.org/pdf/2503.05037</a></p> <p data-line="8" class="code-line"><img src="https://storage.googleapis.com/zenn-user-upload/10258576e44f-20250315.png" loading="lazy" class="md-img"></p> <h2 id="%E3%82%B5%E3%83%9E%E3%83%AA%E3%83%BC" data-line="10" class="code-line"> <a class="header-anchor-link" href="#%E3%82%B5%E3%83%9E%E3%83%AA%E3%83%BC" aria-hidden="true"></a> サマリー</h2> <p data-line="12" class="code-line">Embeddingは、RAGの検索能力の根幹に関わる機能ですが、その性能や特性についてはあまり知られてはいません。実は、保管するテキストの文体や分割方法次第で最大90%程度、検索性能が下がってしまいます。</p> <p data-line="14" 
class="code-line">今回紹介する論文では、Embeddingの性能を著しく下げるテキストの特徴を調べ、その性質についてまとめています。特に「文章の位置」、「使用する単語」、「文章量」による影響が大きいとされています。残念ながら具体的な対策は提示されていませんので、対応策の例として、これまでに紹介した記事をあわせて明記しておきました。そちらもぜひ、ご確認ください。</p> <h2 id="%E5%95%8F%E9%A1%8C%E6%84%8F%E8%AD%98" data-line="17" class="code-line"> <a class="header-anchor-link" href="#%E5%95%8F%E9%A1%8C%E6%84%8F%E8%AD%98" aria-hidden="true"></a> 問題意識</h2> <h3 id="embedding%E3%81%AE%E7%89%B9%E5%BE%B4%E3%81%A8%E6%95%B0%E5%80%A4%E7%9A%84%E3%81%AA%E6%84%8F%E5%91%B3" data-line="19" class="code-line"> <a class="header-anchor-link" href="#embedding%E3%81%AE%E7%89%B9%E5%BE%B4%E3%81%A8%E6%95%B0%E5%80%A4%E7%9A%84%E3%81%AA%E6%84%8F%E5%91%B3" aria-hidden="true"></a> Embeddingの特徴と数値的な意味</h3> <p data-line="21" class="code-line">Embeddingがどういったものかおさらいします。Embeddingは、主に数百文字程度の文章を数百次元以上のベクトルデータに置き換える機能を有しています。そして、このベクトルデータには以下のような特徴があります。</p> <ul data-line="23" class="code-line"> <li data-line="23" class="code-line">ベクトルデータは、文章の意味を表している</li> <li data-line="24" class="code-line">ベクトルデータの内積が1に近いほど2つの文章の意味は類似している</li> </ul> <p data-line="26" class="code-line">厳密には異なる特性を持つものもありますが、RAGについて考えるうえではこの理解で十分です。特に内積を取るだけで文章の類似度がわかるという性質のお陰で、EmbeddingはRAGの主要な検索機能としての役割を果たしています。</p> <h3 id="embedding%E3%81%AE%E9%99%90%E7%95%8C" data-line="28" class="code-line"> <a class="header-anchor-link" href="#embedding%E3%81%AE%E9%99%90%E7%95%8C" aria-hidden="true"></a> Embeddingの限界</h3> <p data-line="30" class="code-line">しかし、Embeddingにも限界があり一般的には「固有名詞に弱い」と言われています。これは、Embeddingの学習内で利用されてこなかった単語の意味をEmbeddingが理解できないためです。これは一つの例ですが、論文内では更に以下の要素によってEmbeddingの性能が下がると言及されています。</p> <ul data-line="32" class="code-line"> <li data-line="32" class="code-line">位置バイアス: 重要な情報が文章内のどの位置にあるか</li> <li data-line="33" class="code-line">単語バイアス: 意味と関係なく同じ単語が含まれているか</li> <li data-line="34" class="code-line">文章量バイアス: 文章量が多いか少ないか</li> </ul> <h2 id="embedding%E3%81%AE%E6%80%A7%E8%83%BD%E3%82%92%E5%BC%95%E3%81%8D%E4%B8%8B%E3%81%92%E3%82%8B%E6%96%87%E7%AB%A0%E3%81%AE%E7%89%B9%E5%BE%B4" data-line="36" 
class="code-line"> <a class="header-anchor-link" href="#embedding%E3%81%AE%E6%80%A7%E8%83%BD%E3%82%92%E5%BC%95%E3%81%8D%E4%B8%8B%E3%81%92%E3%82%8B%E6%96%87%E7%AB%A0%E3%81%AE%E7%89%B9%E5%BE%B4" aria-hidden="true"></a> Embeddingの性能を引き下げる文章の特徴</h2> <p data-line="38" class="code-line">ここでは、各種の文章の傾向によって生じる文章間の類似度の低下についてまとめています。比較に使用されている数値は、割合を表すものではなくt検定と呼ばれる比較手法で得られる値(t値)で、値が大きいほどずれが大きいことを示しています。</p> <h3 id="%E4%BD%8D%E7%BD%AE%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9" data-line="40" class="code-line"> <a class="header-anchor-link" href="#%E4%BD%8D%E7%BD%AE%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9" aria-hidden="true"></a> 位置バイアス</h3> <p data-line="42" class="code-line"><img src="https://storage.googleapis.com/zenn-user-upload/ca2f7e36fe8f-20250317.png" loading="lazy" class="md-img"></p> <p data-line="44" class="code-line">入力したクエリに対する回答が、どの位置にあるかによって類似度がどのように変化するかを示したグラフです。文章の先頭にある状態が最も類似度の高い状態で、そこから減少していきモデルごとに最も性能が下がるポイントは異なりますが、誤差の範囲の7~20倍ものズレが発生してしまうことを示しています。</p> <p data-line="46" class="code-line">【対抗策】</p> <ul data-line="47" class="code-line"> <li data-line="47" class="code-line"><a href="https://zenn.dev/knowledgesense/articles/fe155b25510683" target="_blank">RAGで人間の脳を再現する</a></li> <li data-line="48" class="code-line"><a href="https://zenn.dev/knowledgesense/articles/077ad1ab0f9ff6" target="_blank">RAGの「ベクトル検索」の弱みを、ナレッジグラフで補う</a></li> <li data-line="49" class="code-line"><a href="https://zenn.dev/knowledgesense/articles/71e3d2d2ff3858" target="_blank">RAGのチャンク化を最適化する「RAPTOR」について</a></li> </ul> <h3 id="%E5%8D%98%E8%AA%9E%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9" data-line="52" class="code-line"> <a class="header-anchor-link" href="#%E5%8D%98%E8%AA%9E%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9" aria-hidden="true"></a> 単語バイアス</h3> <p data-line="54" class="code-line"><img src="https://storage.googleapis.com/zenn-user-upload/17e688ba21f5-20250317.png" loading="lazy" class="md-img"></p> <p data-line="56" class="code-line">意味は同じだが、表記が異なる(例えば"US"と"United 
States")とドキュメントの類似度に誤差の範囲の20倍以上の差が生まれます。たとえば、「United States」を含むクエリで検索した場合、対象のドキュメントが「United States」なのか「US」なのかで、大きな差が生まれることになります。</p> <p data-line="58" class="code-line">【対抗策】</p> <ul data-line="59" class="code-line"> <li data-line="59" class="code-line"><a href="https://zenn.dev/knowledgesense/articles/fe155b25510683" target="_blank">RAGで人間の脳を再現する</a></li> <li data-line="60" class="code-line"><a href="https://zenn.dev/knowledgesense/articles/077ad1ab0f9ff6" target="_blank">RAGの「ベクトル検索」の弱みを、ナレッジグラフで補う</a></li> <li data-line="61" class="code-line"><a href="https://zenn.dev/knowledgesense/articles/abf34c417b079e" target="_blank">RAGの精度と速度を同時に向上「DIVA」による曖昧さ対策</a></li> </ul> <h3 id="%E6%96%87%E7%AB%A0%E9%87%8F%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9" data-line="63" class="code-line"> <a class="header-anchor-link" href="#%E6%96%87%E7%AB%A0%E9%87%8F%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9" aria-hidden="true"></a> 文章量バイアス</h3> <p data-line="64" class="code-line"><img src="https://storage.googleapis.com/zenn-user-upload/28b98b6f8610-20250317.png" loading="lazy" class="md-img"></p> <p data-line="66" class="code-line">保管されている文章の長さによる類似度の比較結果です。この結果は正解となる文章だけの場合と、正解とは無関係な文章を含んだ場合で、類似度にどの程度差が生まれるかを示しています。</p> <p data-line="69" class="code-line">【対抗策】</p> <ul data-line="70" class="code-line"> <li data-line="70" class="code-line"><a href="https://zenn.dev/knowledgesense/articles/fe155b25510683" target="_blank">RAGで人間の脳を再現する</a></li> <li data-line="71" class="code-line"><a href="https://zenn.dev/knowledgesense/articles/71e3d2d2ff3858" target="_blank">RAGのチャンク化を最適化する「RAPTOR」について</a></li> <li data-line="72" class="code-line"><a href="https://zenn.dev/knowledgesense/articles/e0ade68c265200" target="_blank">RAGの「文脈が消える問題」を解決する「LongRAG」</a></li> </ul> <h3 id="%E7%B2%BE%E5%BA%A6%E3%81%B8%E3%81%AE%E5%BD%B1%E9%9F%BF" data-line="75" class="code-line"> <a class="header-anchor-link" href="#%E7%B2%BE%E5%BA%A6%E3%81%B8%E3%81%AE%E5%BD%B1%E9%9F%BF" aria-hidden="true"></a> 精度への影響</h3> <p data-line="77" 
class="code-line"><img src="https://storage.googleapis.com/zenn-user-upload/6b51b8f595b4-20250317.png" loading="lazy" class="md-img"></p> <p data-line="79" class="code-line">正解を含むが関係のない文章も含む正解ドキュメントと、正解を含まないがEmbeddingが一致していると導きたくなる特徴を持つ不正解ドキュメントを比較して、正解ドキュメントを類似していると判断できるかを実験しています。結果として、圧倒的に不正解ドキュメントを取得する傾向にあり、正解ドキュメントを選べる確率はほぼ0%となっています。</p> <h2 id="%E3%81%BE%E3%81%A8%E3%82%81" data-line="81" class="code-line"> <a class="header-anchor-link" href="#%E3%81%BE%E3%81%A8%E3%82%81" aria-hidden="true"></a> まとめ</h2> <p data-line="83" class="code-line">ドキュメントの特徴による、Embedding性能への影響についてまとめた論文を紹介しました。最後の「精度の影響」に記載されるような極端な例はそう多くは無いですが、これらのバイアスが反映されることで知らず知らずのうちにRAGの性能を引き下げることがあります。実際にRAGを運用していると、他にも、「意味的な区切りを無視して文章を分割してしまうことで意味が損なわれるケース」や、「パソコンの再起動」と「アプリの再起動」のように言葉としては似ているが、使われ方が全く違うような言葉が類似していると判定されてしまうといったケースがあげられます。汎用的に問題に対処するのは難しいので、自身のユースケースを見極めて、それにあった対処法を見つけることが重要です。対抗策に示された手法も、一つの対策になるのでぜひご活用ください。</p> </div></div><div class="View_actions__s_UJk" id="share"><div class="LikeButton_container__YlckE style-large"><button aria-label="いいね" class="LikeButton_button__ZwdG4" data-pressed="false"><svg class="LikeButton_svgLike__Gl0Sz" viewBox="0 0 110 110"><path class="LikeButton_svgLikeLine__f9txR" d="M73,24a23.78,23.78,0,0,0-15.89,6.19,3.14,3.14,0,0,1-4.18,0A23.81,23.81,0,0,0,37,24a22,22,0,0,0-22,22c0,16.67,19.64,32.82,25.11,37.93,2.84,2.65,6.15,5.64,8.92,8.13a8.9,8.9,0,0,0,11.9,0c2.77-2.49,6.07-5.48,8.91-8.13C75.37,78.81,95,62.66,95,46A22,22,0,0,0,73,24Z" fill="currentColor"></path><path class="LikeButton_svgLikeInner__uiexS" d="M66.25,76.42c-.71.64-1.32,1.2-1.82,1.67-2.51,2.33-5.39,5-7.94,7.25a2.21,2.21,0,0,1-3,0C51,83,48.1,80.42,45.59,78.09c-.5-.47-1.12-1-1.82-1.67C38.09,71.29,23,57.67,23,46A14,14,0,0,1,37,32a15.92,15.92,0,0,1,11.65,5.23l4.73,5a2.2,2.2,0,0,0,3.23,0l4.72-5A16.06,16.06,0,0,1,73,32,14,14,0,0,1,87,46C87,57.67,71.93,71.29,66.25,76.42Z" fill="currentColor"></path><g class="LikeButton_svgLikeDecoration__78UjB"><circle cx="41.5" cy="9.5" fill="#3ea8ff" 
r="3.5"></circle><circle cx="98.5" cy="26.5" fill="#ffdc6e" r="3.5"></circle><circle cx="13" cy="19" fill="#c067f4" r="5"></circle><circle cx="77" cy="9" fill="#f76685" r="5"></circle><circle cx="26.5" cy="92.5" fill="#f76685" r="3.5"></circle><circle cx="105.5" cy="48.5" fill="#c067f4" r="3.5"></circle><circle cx="4.5" cy="60.5" fill="#3ea8ff" r="3.5"></circle><circle cx="94.5" cy="73.5" fill="#3ea8ff" r="1.5"></circle><circle cx="16.5" cy="75.5" fill="#ffdc6e" r="1.5"></circle><circle cx="78.5" cy="91.5" fill="#ffdc6e" r="1.5"></circle></g></svg></button></div><div class="View_menuAndShareButtonsContainer__PMtHU"><div class="View_menu__wgMxq"><button aria-label="その他の操作" class="PopoverMenuButton_menuButton__hKCa_"><svg viewBox="0 0 27 27" height="16" width="16"><path fill="currentColor" d="M12.74,20.53,3.48,11.35a.75.75,0,0,1,0-1.07L4.71,9.05a.75.75,0,0,1,1.07,0l7.49,7.41,7.49-7.41a.74.74,0,0,1,1.06,0l1.24,1.23a.77.77,0,0,1,0,1.07L13.8,20.53A.74.74,0,0,1,12.74,20.53Z"></path></svg></button></div><div class="ShareButtonsExperimental_container__CrtBj" style="flex-direction:row;align-items:center"><a aria-label="X(Twitter)にポスト" class="ShareButtonsExperimental_button__d9aXF ShareButtonsExperimental_svgTwitterHorizontal__5xDGM ShareButtonsExperimental_svgTwitterBase__8FrNq" data-tooltip-for-desktop="true" data-tooltip-position="bottom" href="https://twitter.com/intent/tweet?url=https://zenn.dev/knowledgesense/articles/ff2c528acf6b04&amp;text=RAG%E3%81%AE%E6%A4%9C%E7%B4%A2%E6%80%A7%E8%83%BD%E3%82%9290%EF%BC%85%E3%82%82%E4%BD%8E%E4%B8%8B%E3%81%95%E3%81%9B%E3%82%8B%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%81%AE%E8%90%BD%E3%81%A8%E3%81%97%E7%A9%B4%EF%BD%9Csasakuna&amp;hashtags=zenn" id="gtm-article-footer-tweet" rel="nofollow noopener noreferrer" role="tooltip" target="_blank"><svg width="27" height="28" viewBox="0 0 27 28" fill="none"><g clip-path="url(#clip0_1_18)"><path d="M16.0687 11.7356L26.12 0H23.7382L15.0106 10.1899L8.03988 0H0L10.5411 15.4089L0 
27.7155H2.38199L11.5985 16.9546L18.9601 27.7155H27L16.0681 11.7356H16.0687ZM12.8062 15.5447L11.7382 14.0103L3.24025 1.80106H6.89884L13.7568 11.6543L14.8248 13.1887L23.7393 25.9963H20.0807L12.8062 15.5452V15.5447Z" fill="currentColor"></path></g><defs><clipPath id="clip0_1_18"><rect width="27" height="27.7297" fill="white"></rect></clipPath></defs></svg></a><a aria-label="Facebookに投稿" class="ShareButtonsExperimental_button__d9aXF ShareButtonsExperimental_svgFacebookHorizontal__MpfBm ShareButtonsExperimental_svgFacebookBase___gDOW" data-tooltip-for-desktop="true" data-tooltip-position="bottom" href="http://www.facebook.com/sharer.php?u=https://zenn.dev/knowledgesense/articles/ff2c528acf6b04" id="gtm-article-footer-facebook" rel="nofollow noopener noreferrer" role="tooltip" target="_blank"><svg width="28" height="29" viewBox="0 0 28 29" fill="none"><g clip-path="url(#clip0_2_21)"><path d="M28 14.7812C28 7.02084 21.7319 0.729736 14 0.729736C6.26808 0.729736 0 7.02084 0 14.7812C0 21.3707 4.52032 26.9002 10.6182 28.4189V19.0753H7.73136V14.7812H10.6182V12.9309C10.6182 8.14833 12.7747 5.93158 17.453 5.93158C18.34 5.93158 19.8705 6.10637 20.4966 6.28061V10.1729C20.1662 10.138 19.5922 10.1206 18.8793 10.1206C16.5838 10.1206 15.6968 10.9935 15.6968 13.2625V14.7812H20.2698L19.4841 19.0753H15.6968V28.7297C22.629 27.8895 28.0006 21.9654 28.0006 14.7812H28Z" fill="currentColor"></path></g><defs><clipPath id="clip0_2_21"><rect width="28" height="28" fill="white" transform="translate(0 0.729736)"></rect></clipPath></defs></svg></a><a aria-label="はてなブックマークに登録" class="ShareButtonsExperimental_button__d9aXF ShareButtonsExperimental_svgHatenaBookmarkHorizontal__ccpWn ShareButtonsExperimental_svgHatenaBookmarkBase__VXz6p" data-tooltip-for-desktop="true" data-tooltip-position="bottom" 
href="https://b.hatena.ne.jp/add?mode=confirm&amp;url=https://zenn.dev/knowledgesense/articles/ff2c528acf6b04&amp;title=RAG%E3%81%AE%E6%A4%9C%E7%B4%A2%E6%80%A7%E8%83%BD%E3%82%9290%EF%BC%85%E3%82%82%E4%BD%8E%E4%B8%8B%E3%81%95%E3%81%9B%E3%82%8B%E3%83%86%E3%82%AD%E3%82%B9%E3%83%88%E3%81%AE%E8%90%BD%E3%81%A8%E3%81%97%E7%A9%B4%EF%BD%9Csasakuna" id="gtm-article-footer-hatena-bookmark" rel="nofollow noopener noreferrer" role="tooltip" target="_blank"><svg width="27" height="28" viewBox="0 0 27 28" fill="none"><path fill-rule="evenodd" clip-rule="evenodd" d="M5.4999 0.729736H21.5001C24.5376 0.729736 27 3.19213 27 6.22964V22.2298C27 25.2673 24.5376 27.7297 21.5001 27.7297H5.4999C2.46239 27.7297 0 25.2673 0 22.2298V6.22964C0 3.19213 2.46239 0.729736 5.4999 0.729736ZM12.98 13.7472C13.8521 13.8136 14.5319 14.1209 15.0212 14.6673V14.6679C15.512 15.2127 15.7572 15.9444 15.7572 16.8543C15.7572 17.5126 15.62 18.0877 15.3436 18.5867C15.0692 19.084 14.6756 19.4712 14.1604 19.7433C13.7441 19.9658 13.2289 20.1278 12.6155 20.2245C12.001 20.3179 10.9966 20.3665 9.6039 20.3665H6.13656V8.09392H9.50184C10.8859 8.09392 11.8503 8.13875 12.4016 8.22623C12.9503 8.31695 13.4206 8.4703 13.8164 8.68792C14.269 8.93902 14.6129 9.27437 14.8527 9.69341C15.0865 10.1151 15.2069 10.6028 15.2069 11.1546C15.2069 11.8512 15.0309 12.4053 14.6761 12.813C14.3197 13.2261 13.756 13.5355 12.98 13.7472ZM9.96192 10.8144H9.24372H9.24318V13.2768H9.91116C10.7352 13.2768 11.2984 13.1856 11.6073 13.0058C11.9124 12.8227 12.0663 12.5273 12.0663 12.0667C12.0663 11.6061 11.9216 11.2821 11.6375 11.0947C11.3497 10.9084 10.7919 10.8144 9.96192 10.8144ZM10.4128 18.1514C11.2028 18.1514 11.7661 18.0531 12.0928 17.8523V17.8528C12.4227 17.653 12.5863 17.3209 12.5863 16.8581C12.5863 16.3408 12.4367 15.9806 12.1338 15.7765C11.8357 15.5724 11.2747 15.4703 10.463 15.4703H9.24372V18.1514H10.4128ZM19.3093 17.257C18.4502 17.257 17.7547 17.9525 17.7547 18.8111C17.7547 19.6697 18.4507 20.3658 19.3093 20.3658C20.1679 20.3658 20.8634 19.6697 
20.8634 18.8111C20.8634 17.9525 20.1668 17.257 19.3093 17.257ZM17.9593 8.09318H20.6593V16.2753H17.9593V8.09318Z" fill="currentColor"></path></svg></a></div></div></div><span style="display:block;height:2rem;flex-shrink:0"></span><aside class="View_authorInfo__F19rR"><div class="ProfileCard_container__YfvQl"><a class="ProfileCard_avatar__tIJpR" href="/sasakuna"><img alt="sasakuna" class="AvatarImage_plain__Fgp4R " height="80" loading="lazy" referrerPolicy="no-referrer" src="https://storage.googleapis.com/zenn-user-upload/avatar/fd356008ae.jpeg" width="80"/></a><div class="ProfileCard_name__qXamf"><a class="ProfileCard_displayName__gRUeY" href="/sasakuna">sasakuna</a></div><div class="ProfileCard_content__1w905"><p class="Paragraph_common__yRSrj Paragraph_sidenote__9NTjJ Paragraph_decorateLink__aIAFh"><span>Knowledge Sense, Inc. CTO</span></p><div class="ProfileCard_actions__2ZjZ8"><span class="ProfileCard_follow__ng60N"></span><a aria-label="@sasa_kuna_" class="ProfileCard_twitterLink__l4sOK ProfileCard_linkBase__hVELe" data-tooltip-for-desktop="true" data-tooltip-position="bottom" href="https://twitter.com/sasa_kuna_" rel="nofollow noopener noreferrer" role="tooltip" target="_blank"><svg width="27" height="28" viewBox="0 0 27 28" fill="none" aria-label="X(Twitter)"><g clip-path="url(#clip0_1_18)"><path d="M16.0687 11.7356L26.12 0H23.7382L15.0106 10.1899L8.03988 0H0L10.5411 15.4089L0 27.7155H2.38199L11.5985 16.9546L18.9601 27.7155H27L16.0681 11.7356H16.0687ZM12.8062 15.5447L11.7382 14.0103L3.24025 1.80106H6.89884L13.7568 11.6543L14.8248 13.1887L23.7393 25.9963H20.0807L12.8062 15.5452V15.5447Z" fill="currentColor"></path></g><defs><clipPath id="clip0_1_18"><rect width="27" height="27.7297" fill="white"></rect></clipPath></defs></svg></a></div></div></div></aside><div class="View_publicationInfo__MKZ62"><div class="ProfileCard_container__YfvQl"><a class="ProfileCard_avatar__tIJpR" href="/p/knowledgesense"><img alt="ナレッジセンス - AI知見共有ブログ" class="AvatarImage_plain__Fgp4R 
AvatarImage_withPublication__hNtIe" height="80" loading="lazy" referrerPolicy="no-referrer" src="https://zenn.dev/images/default-publication-avatar.png" width="80"/></a><div class="ProfileCard_name__qXamf"><a class="ProfileCard_displayName__gRUeY" href="/p/knowledgesense">ナレッジセンス - AI知見共有ブログ</a><span><a class="PublicationLabelLink_link__JBqs2" href="/faq#what-is-publication">Publication</a></span></div><div class="ProfileCard_content__1w905"><p class="Paragraph_common__yRSrj Paragraph_sidenote__9NTjJ Paragraph_decorateLink__aIAFh"><span>株式会社ナレッジセンスは、「大企業の知的活動を最速にする」をミッションに掲げ、社内データ検索ができるAIチャットボットを開発・提供しているスタートアップです。このブログでは、LLMや検索技術、RAGの実装戦略などについて知見を共有していきます。</span></p><div class="ProfileCard_actions__2ZjZ8"><span class="ProfileCard_follow__ng60N"></span></div></div></div></div></div></div><div id="discuss"><div class="ArticleComments_commentsContainer__kOO0n"><section class="ArticleComments_comments__y4Azs"><div class="ArticleComments_emptyContainer__I4fw6"><h3 class="Heading_size-lg__KD3Up Heading_centered__lNLF_">Discussion</h3><img class="ArticleComments_emptyImg__FGwCr" src="https://static.zenn.studio/images/drawing/discussion.png" width="300"/></div></section></div></div></section><aside class="View_sidebarContainer__YwcNH"><div class="ArticleSidebar_container__jSRJw"><div><div class="ArticleSidebar_user__vJ7nz ArticleSidebar_sidebarCard__AtM_Z"><div class="SidebarUserBio_container__iWemi"><a href="/sasakuna"><img alt="sasakuna" class="AvatarImage_border__pDIjF AvatarImage_plain__Fgp4R " height="60" loading="lazy" referrerPolicy="no-referrer" src="https://storage.googleapis.com/zenn-user-upload/avatar/fd356008ae.jpeg" width="60"/></a><div class="SidebarUserBio_author__cM7pP"><a class="SidebarUserBio_name__0zFdT" href="/sasakuna">sasakuna</a><div class="SidebarUserBio_actions__oFupD"><a class="SidebarUserBio_twitterLink__yGgDq SidebarUserBio_link__nnh24" href="https://twitter.com/sasa_kuna_" rel="nofollow noopener noreferrer" target="_blank"><span 
aria-label="@sasa_kuna_" data-tooltip-for-desktop="true" data-tooltip-position="bottom" role="tooltip"><svg width="27" height="28" viewBox="0 0 27 28" fill="none" aria-label="X(Twitter)"><g clip-path="url(#clip0_1_18)"><path d="M16.0687 11.7356L26.12 0H23.7382L15.0106 10.1899L8.03988 0H0L10.5411 15.4089L0 27.7155H2.38199L11.5985 16.9546L18.9601 27.7155H27L16.0681 11.7356H16.0687ZM12.8062 15.5447L11.7382 14.0103L3.24025 1.80106H6.89884L13.7568 11.6543L14.8248 13.1887L23.7393 25.9963H20.0807L12.8062 15.5452V15.5447Z" fill="currentColor"></path></g><defs><clipPath id="clip0_1_18"><rect width="27" height="27.7297" fill="white"></rect></clipPath></defs></svg></span></a></div></div></div><span style="display:block;height:1rem;flex-shrink:0"></span><p class="Paragraph_common__yRSrj Paragraph_description-sm__vmr99 Paragraph_decorateLink__aIAFh"><span>Knowledge Sense, Inc. CTO</span></p></div></div><span style="display:block;height:1.5rem;flex-shrink:0"></span><div class="ArticleSidebar_sticky__W61mq"><div style="display:flex;gap:1.5rem;flex-direction:column;align-items:stretch;flex-wrap:nowrap"><div class="ArticleSidebarToc_toc__dUPn8"><div class="ArticleSidebarToc_tocTitle__A3VjO">目次</div><div class="ArticleToc_toc__WF75u"><ol class="ol-depth-1"><li><a href="#%E5%B0%8E%E5%85%A5">導入</a></li><li><a href="#%E3%82%B5%E3%83%9E%E3%83%AA%E3%83%BC">サマリー</a></li><li><a href="#%E5%95%8F%E9%A1%8C%E6%84%8F%E8%AD%98">問題意識</a><ol class="ol-depth-2"><li><a href="#embedding%E3%81%AE%E7%89%B9%E5%BE%B4%E3%81%A8%E6%95%B0%E5%80%A4%E7%9A%84%E3%81%AA%E6%84%8F%E5%91%B3">Embeddingの特徴と数値的な意味</a></li><li><a href="#embedding%E3%81%AE%E9%99%90%E7%95%8C">Embeddingの限界</a></li></ol></li><li><a href="#embedding%E3%81%AE%E6%80%A7%E8%83%BD%E3%82%92%E5%BC%95%E3%81%8D%E4%B8%8B%E3%81%92%E3%82%8B%E6%96%87%E7%AB%A0%E3%81%AE%E7%89%B9%E5%BE%B4">Embeddingの性能を引き下げる文章の特徴</a><ol class="ol-depth-2"><li><a href="#%E4%BD%8D%E7%BD%AE%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9">位置バイアス</a></li><li><a 
href="#%E5%8D%98%E8%AA%9E%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9">単語バイアス</a></li><li><a href="#%E6%96%87%E7%AB%A0%E9%87%8F%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9">文章量バイアス</a></li><li><a href="#%E7%B2%BE%E5%BA%A6%E3%81%B8%E3%81%AE%E5%BD%B1%E9%9F%BF">精度への影響</a></li></ol></li><li><a href="#%E3%81%BE%E3%81%A8%E3%82%81">まとめ</a></li></ol></div></div></div></div></div></aside></div></div></div></div><div id="related-contents"></div></article><footer class="AppFooter_footer__pqSnY"><div class="Container_wide__ykGLh Container_common__figYY"><div class="AppFooter_inner__uGxbT" data-nosnippet="true"><div class="AppFooter_brandingColumn__BikTT"><a class="ZennLogo_logoLink__NJS2l" href="/"><svg x="0px" y="0px" viewBox="0 0 377.4 88.3" aria-label="Zenn | エンジニアのための情報共有コミュニティ" class="ZennLogo_logoSvg__yOhcg" height="20" width="85"><title>Zenn</title><g fill="currentColor"><path d="M233,56.8h-39c0.5,3.5,2.2,6.8,4.8,9.2c2.7,2.3,6.2,3.5,9.8,3.4c2.8,0,5.6-0.5,8.2-1.7c2.5-1.1,4.8-2.8,6.5-5l8.2,9.5 c-2.5,3.4-5.7,6.1-9.5,7.9c-4.6,2.2-9.6,3.3-14.7,3.2c-5.7,0.1-11.4-1.2-16.5-4c-4.5-2.5-8.2-6.3-10.7-10.9s-3.8-9.8-3.7-15.1v-2.2 c-0.1-5.7,1.1-11.3,3.5-16.5c2.2-4.7,5.7-8.6,10.1-11.3c4.7-2.8,10.1-4.2,15.5-4.1c5.2-0.1,10.3,1.1,14.9,3.7 c4.1,2.5,7.4,6.2,9.4,10.5c2.2,5.1,3.3,10.5,3.2,16.1V56.8z M216.1,43.9c0.1-2.9-0.9-5.7-2.8-7.9c-1.8-1.9-4.4-2.9-7.9-2.9 c-2.9-0.1-5.8,1.1-7.7,3.2c-2,2.6-3.3,5.7-3.6,9h22V43.9z"></path><path d="M128.3,67.9h36.1v14.7h-56.9V72l35.8-54.3h-36.2V2.9h56.6v10.4L128.3,67.9z"></path><path d="M248.8,50.7c0-19.1,12.7-29.2,28.2-29.2s27.9,10.1,27.9,29.2V82h-16V51.4c0-10.6-4.8-16.1-12-16.1s-12.4,5.5-12.4,16.1 v30.7h-15.8L248.8,50.7L248.8,50.7z"></path><path d="M320.3,50.7c0-19.1,12.7-29.2,28.2-29.2s27.9,10.1,27.9,29.2V82h-16V51.4c0-10.6-4.8-16.1-12-16.1S336,40.8,336,51.4v30.7 h-15.8L320.3,50.7L320.3,50.7z"></path></g><path fill="#3EA8FF" class="st0" d="M2.4,83.3h17c0.9,0,1.7-0.5,2.2-1.2L68.4,5.2C69,4.2,68.3,3,67.1,3H51c-0.8,0-1.5,0.4-1.9,1.1L1.6,81.9 
C1.3,82.5,1.7,83.3,2.4,83.3z"></path><path fill="#3EA8FF" class="st0" d="M61,82.1l22.1-35.5c0.7-1.1-0.1-2.5-1.4-2.5H65.7c-0.6,0-1.2,0.3-1.5,0.8L41.5,81.2c-0.6,0.9,0.1,2.1,1.2,2.1 h16.3C59.8,83.3,60.6,82.9,61,82.1z"></path></svg></a><p class="AppFooter_siteDescription__NWGP2">エンジニアのための<br aria-hidden="true"/>情報共有コミュニティ</p></div><div class="AppFooter_navColumns__ahV9g"><nav class="AppFooter_navColumn__47qTk"><h4 class="AppFooter_navColumnTitle__vVeiQ">About</h4><ul><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/about">Zennについて</a></div></li><li><a href="https://classmethod.jp" rel="nofollow noopener noreferrer" target="_blank">運営会社</a></li><li><a href="https://info.zenn.dev" rel="nofollow noopener noreferrer" target="_blank">お知らせ・リリース</a></li><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/events">イベント</a></div></li></ul></nav><nav class="AppFooter_navColumn__47qTk"><h4 class="AppFooter_navColumnTitle__vVeiQ">Guides</h4><ul><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/manual">使い方</a></div></li><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/biz-lp">法人向けメニュー</a><span class="NewLabel_newLabelSecondary__6Iy_T NewLabel_newLabel__Xva_r">New</span></div></li><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/publications">Publication / Pro</a></div></li><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/faq">よくある質問</a></div></li></ul></nav><nav class="AppFooter_navColumn__47qTk"><h4 class="AppFooter_navColumnTitle__vVeiQ">Links</h4><ul><li><a href="https://twitter.com/zenn_dev" rel="nofollow noopener noreferrer" target="_blank">X(Twitter)</a></li><li><a href="https://github.com/zenn-dev" rel="nofollow noopener noreferrer" 
target="_blank">GitHub</a></li><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/mediakit">メディアキット</a></div></li></ul></nav><nav class="AppFooter_navColumn__47qTk"><h4 class="AppFooter_navColumnTitle__vVeiQ">Legal</h4><ul><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/terms">利用規約</a></div></li><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/privacy">プライバシーポリシー</a></div></li><li><div style="display:flex;gap:0.5rem;flex-direction:row;align-items:center;flex-wrap:nowrap"><a href="/terms/transaction-law">特商法表記</a></div></li></ul></nav></div></div><div class="AppFooter_copyright__J_Jbe" data-nosnippet="true"><div class="AnnouncementPopUp_hiddenWrapper__ThDWT"><div aria-hidden="true" class="PopUp_popup__lIgfz AnnouncementPopUp_container__Pb5q2"><div></div></div></div><a href="https://classmethod.jp/"><svg width="115" height="25" viewBox="0 0 271 60" class="AppFooter_classmethodLogo__1RFuV"><g clip-path="url(#clip0_2_90)"><path d="M264.636 3.1617C265.475 3.16128 266.279 3.49393 266.873 4.08651C267.466 4.67909 267.8 5.48309 267.801 6.32175V53.0703C267.801 53.5668 267.703 54.0585 267.513 54.5171C267.323 54.9758 267.045 55.3926 266.694 55.7437C266.342 56.0948 265.926 56.3733 265.467 56.5633C265.008 56.7533 264.517 56.851 264.02 56.851H6.93132C5.92861 56.851 4.96697 56.4527 4.25795 55.7437C3.54892 55.0347 3.1506 54.073 3.1506 53.0703V6.93139C3.1506 5.92868 3.54892 4.96704 4.25795 4.25802C4.96697 3.54899 5.92861 3.15067 6.93132 3.15067H225.05C225.137 3.15178 225.221 3.17053 225.3 3.20575C225.379 3.24097 225.449 3.29192 225.507 3.35546L233.987 12.76C234.28 13.0541 234.628 13.2875 235.011 13.4467C235.394 13.606 235.805 13.688 236.22 13.688C236.635 13.688 237.046 13.606 237.429 13.4467C237.812 13.2875 238.16 13.0541 238.453 12.76L246.933 3.35231C246.99 3.28778 247.061 3.23616 247.139 3.20086C247.218 3.16555 
247.303 3.14737 247.39 3.14752H264.65L264.636 3.1617ZM264.636 7.17095e-05H246.1C246.006 0.000759845 245.914 0.0205359 245.828 0.058188C245.742 0.0958401 245.665 0.150575 245.602 0.219038L237.347 9.39516C237.334 9.40757 237.323 9.4213 237.313 9.43611C237.018 9.72475 236.622 9.88642 236.21 9.88642C235.797 9.88642 235.402 9.72475 235.107 9.43611C235.094 9.42426 235.082 9.41045 235.072 9.39516L226.843 0.239517C226.775 0.162922 226.691 0.10189 226.597 0.0605717C226.504 0.0192536 226.402 -0.00138099 226.3 7.17095e-05H6.93132C5.09302 7.17095e-05 3.33001 0.730333 2.03014 2.03021C0.730262 3.33008 0 5.09309 0 6.93139L0 53.0719C0 54.9102 0.730262 56.6732 2.03014 57.9731C3.33001 59.273 5.09302 60.0032 6.93132 60.0032H264.02C265.858 60.0032 267.621 59.273 268.921 57.9731C270.221 56.6732 270.951 54.9102 270.951 53.0719V6.31545C270.949 4.64114 270.283 3.03602 269.099 1.8521C267.916 0.668193 266.31 0.00215599 264.636 7.17095e-05Z" fill="currentColor"></path><path d="M29.9653 23.8863C30.0769 23.8895 30.1866 23.9161 30.2873 23.9644C30.388 24.0126 30.4775 24.0813 30.5501 24.1662C30.6226 24.251 30.6766 24.3501 30.7086 24.4571C30.7405 24.5641 30.7498 24.6766 30.7357 24.7873V28.3554C30.7357 28.85 30.4679 29.0769 29.9653 29.0769H22.6008C18.4483 29.0769 17.815 30.4789 17.815 36.1657C17.815 41.8526 18.4452 43.2546 22.6008 43.2546H29.9653C30.46 43.2546 30.7357 43.4814 30.7357 43.9351V47.5457C30.7357 48.0892 30.4679 48.316 29.9653 48.4058C27.5367 48.8538 25.0703 49.0648 22.6008 49.0359C12.982 49.0359 11.1326 46.1074 11.1326 36.1642C11.1326 26.2209 12.982 23.2955 22.6008 23.2955C25.0681 23.2729 27.5325 23.4684 29.9653 23.88" fill="currentColor"></path><path d="M39.7637 17.6701C39.9308 17.6701 40.0911 17.7365 40.2093 17.8547C40.3274 17.9729 40.3938 18.1331 40.3938 18.3003V48.4389C40.3938 48.606 40.3274 48.7663 40.2093 48.8845C40.0911 49.0026 39.9308 49.069 39.7637 49.069H34.3872C34.297 49.0756 34.2064 49.0626 34.1216 49.0311C34.0368 48.9995 33.9598 48.9501 33.8959 48.8861C33.8319 48.8221 
33.7825 48.7452 33.7509 48.6604C33.7193 48.5756 33.7064 48.485 33.713 48.3948V18.3444C33.7069 18.258 33.7187 18.1713 33.7477 18.0897C33.7766 18.008 33.8221 17.9333 33.8812 17.87C33.9403 17.8067 34.0118 17.7563 34.0913 17.722C34.1708 17.6876 34.2565 17.6699 34.3431 17.6701H39.7637Z" fill="currentColor"></path><path d="M169.156 35.857V37.259C169.156 38.1601 168.888 38.4767 167.711 38.4767H153.533C153.623 42.541 154.709 43.2624 158.636 43.2624H166.679C166.846 43.2624 167.007 43.3288 167.125 43.447C167.243 43.5652 167.309 43.7255 167.309 43.8926V47.6733C167.32 47.8536 167.26 48.031 167.143 48.1682C167.025 48.3054 166.859 48.3917 166.679 48.409C163.831 48.8866 160.945 49.0976 158.058 49.0391C148.934 49.0391 146.716 46.4682 146.716 36.1673C146.716 25.8664 148.929 23.2955 158.058 23.2955C167.187 23.2955 169.121 25.7782 169.168 35.8507M153.499 34.0911H162.437C162.388 30.1166 161.35 29.0785 158.058 29.0785C154.765 29.0785 153.54 30.1166 153.499 34.0911Z" fill="currentColor"></path><path d="M183.056 43.2908H181.07C179.083 43.2908 178.409 42.9332 178.409 40.9877V29.1462H182.706C182.793 29.1449 182.879 29.126 182.958 29.0907C183.037 29.0553 183.108 29.0043 183.167 28.9407C183.226 28.8771 183.271 28.8022 183.3 28.7206C183.33 28.6389 183.342 28.5522 183.337 28.4657V23.9556C183.337 23.7885 183.27 23.6282 183.152 23.51C183.034 23.3919 182.874 23.3255 182.706 23.3255H178.409V18.2971C178.407 18.1308 178.339 17.9719 178.222 17.8542C178.104 17.7366 177.945 17.6694 177.779 17.667H172.355C172.188 17.667 172.028 17.7334 171.91 17.8515C171.791 17.9697 171.725 18.13 171.725 18.2971V43.242C171.725 48.9351 175.246 49.0706 179.129 49.0706C180.418 49.0631 181.704 48.927 182.966 48.6642C183.072 48.6611 183.176 48.6356 183.272 48.5893C183.367 48.5431 183.452 48.4772 183.52 48.396C183.588 48.3147 183.638 48.22 183.667 48.118C183.696 48.0159 183.703 47.909 183.688 47.804V43.9635C183.694 43.8771 183.682 43.7904 183.653 43.7088C183.624 43.6271 183.579 43.5524 183.52 43.4891C183.46 43.4258 183.389 
43.3754 183.309 43.3411C183.23 43.3067 183.144 43.289 183.058 43.2892" fill="currentColor"></path><path d="M234.551 36.1673C234.551 46.4603 232.426 49.0391 223.034 49.0391C213.642 49.0391 211.517 46.4682 211.517 36.1673C211.517 25.8664 213.683 23.2955 223.034 23.2955C232.385 23.2955 234.551 25.8255 234.551 36.1673ZM218.248 36.1673C218.248 41.8604 218.922 43.2561 223.034 43.2561C227.146 43.2561 227.867 41.8541 227.867 36.1673C227.867 30.4805 227.193 29.0785 223.034 29.0785C218.875 29.0785 218.248 30.4805 218.248 36.1673Z" fill="currentColor"></path><path d="M202.566 23.6594C199.378 23.1745 196.135 23.1745 192.947 23.6594V18.3034C192.947 18.1363 192.881 17.976 192.763 17.8578C192.645 17.7397 192.484 17.6733 192.317 17.6733H186.892C186.725 17.6733 186.565 17.7397 186.446 17.8578C186.328 17.976 186.262 18.1363 186.262 18.3034V48.3916C186.256 48.478 186.268 48.5647 186.296 48.6463C186.325 48.7279 186.371 48.8027 186.43 48.866C186.489 48.9292 186.561 48.9796 186.64 49.014C186.72 49.0484 186.805 49.0661 186.892 49.0658H192.317C192.484 49.0658 192.645 48.9995 192.763 48.8813C192.881 48.7631 192.947 48.6028 192.947 48.4357V36.1799C192.947 31.3595 193.084 29.091 197.733 29.091C202.382 29.091 202.595 31.3595 202.566 36.1799C202.538 41.0633 202.566 48.4389 202.566 48.4389C202.566 48.606 202.633 48.7663 202.751 48.8844C202.869 49.0026 203.029 49.069 203.196 49.069H208.622C208.789 49.069 208.949 49.0026 209.067 48.8844C209.185 48.7663 209.252 48.606 209.252 48.4389V34.9212C209.252 27.6749 207.505 24.5526 202.568 23.6515" fill="currentColor"></path><path d="M259.138 17.4449C259.225 17.4388 259.311 17.4506 259.393 17.4796C259.475 17.5085 259.549 17.554 259.613 17.6131C259.676 17.6722 259.726 17.7437 259.761 17.8232C259.795 17.9027 259.813 17.9884 259.813 18.075V46.3831C259.813 47.6686 259.269 48.1301 257.922 48.3775C254.951 48.8327 251.949 49.065 248.943 49.0722C239.241 49.0722 237.248 46.4036 237.248 36.2492C237.248 26.0948 239.255 23.3318 248.236 23.3318C249.877 23.2852 251.519 
23.3748 253.146 23.5996V18.1112C253.142 18.026 253.155 17.9408 253.185 17.8609C253.215 17.781 253.261 17.7079 253.32 17.6462C253.378 17.5845 253.449 17.5354 253.528 17.5018C253.606 17.4683 253.691 17.4511 253.776 17.4512H259.16L259.138 17.4449ZM249.066 29.1541C244.507 29.1541 243.916 30.3718 243.916 36.202C243.916 41.5312 244.28 43.2908 248.792 43.2908C250.241 43.2893 251.689 43.184 253.124 42.9758V29.1541H249.06H249.066Z" fill="currentColor"></path><path d="M64.2297 31.0649V45.7656C64.2297 47.4496 63.9887 48.0624 61.5343 48.42C58.703 48.8207 55.8479 49.0312 52.9883 49.0501C46.2146 49.0501 42.6465 48.3286 42.5583 41.2398C42.6481 35.1402 45.8113 33.8359 51.8132 33.8359H57.5473V32.2007C57.5473 30.0347 56.1925 29.0863 53.305 29.0863H45.1245C45.0383 29.0852 44.9532 29.0666 44.8744 29.0318C44.7955 28.9969 44.7245 28.9465 44.6657 28.8835C44.6068 28.8205 44.5613 28.7463 44.5319 28.6653C44.5024 28.5843 44.4897 28.4982 44.4943 28.4121V24.7054C44.4901 24.5281 44.5524 24.3557 44.6691 24.2221C44.7857 24.0886 44.9482 24.0036 45.1245 23.9839C47.8206 23.4882 50.5594 23.2624 53.3003 23.3097C62.2385 23.3097 64.2265 26.1972 64.2265 31.0791M52.3519 38.1679C50.0079 38.1679 49.1887 38.9383 49.1887 40.8774C49.1887 43.0435 50.1796 43.6342 53.0262 43.6342C54.544 43.6442 56.0586 43.4921 57.5441 43.1805V38.1679H52.3519Z" fill="currentColor"></path><path d="M76.6021 28.7618H85.994C86.0798 28.7594 86.1643 28.7399 86.2425 28.7047C86.3207 28.6694 86.3912 28.6189 86.4498 28.5562C86.5084 28.4935 86.554 28.4198 86.5839 28.3394C86.6138 28.2589 86.6275 28.1733 86.6241 28.0876V24.4093C86.6093 24.2197 86.5198 24.0437 86.3754 23.92C86.231 23.7963 86.0434 23.7349 85.8538 23.7492C84.3069 23.5917 82.2023 23.4263 79.6928 23.3507C78.9572 23.316 76.7643 23.3018 76.5028 23.3018C70.7688 23.3018 66.8841 24.6566 66.8841 29.896V30.6175C66.7176 32.1302 67.1073 33.6518 67.9804 34.8983C68.8535 36.1447 70.1503 37.0309 71.6289 37.3913C71.6289 37.3913 78.0167 39.6802 78.2231 39.7637C79.2202 40.2583 79.5778 40.8018 
79.5778 41.4965V41.9502C79.5778 43.0781 78.6768 43.5728 76.9171 43.5728H67.5268C67.4402 43.5743 67.3548 43.5934 67.2757 43.6287C67.1966 43.6641 67.1255 43.7151 67.0667 43.7787C67.0078 43.8423 66.9624 43.9171 66.9332 43.9986C66.904 44.0802 66.8916 44.1668 66.8967 44.2533V47.9111C66.8967 48.1317 67.0337 48.5413 67.667 48.5854C69.2139 48.7429 71.3185 48.9083 73.828 48.9839C74.5636 49.0186 76.7565 49.0328 77.018 49.0328C82.752 49.0328 86.6367 47.678 86.6367 42.4386V41.7171C86.8032 40.2044 86.4135 38.6828 85.5404 37.4363C84.6673 36.1898 83.3705 35.3037 81.8919 34.9433C81.8919 34.9433 75.5041 32.6544 75.2977 32.5709C74.3006 32.0762 73.943 31.5328 73.943 30.8381V30.3749C73.943 29.247 74.8441 28.746 76.6037 28.746" fill="currentColor"></path><path d="M98.7933 28.7618H108.185C108.271 28.7594 108.355 28.7399 108.434 28.7047C108.512 28.6694 108.582 28.6189 108.641 28.5562C108.7 28.4935 108.745 28.4198 108.775 28.3394C108.805 28.2589 108.819 28.1733 108.815 28.0876V24.4093C108.8 24.2197 108.711 24.0437 108.567 23.92C108.422 23.7963 108.235 23.7349 108.045 23.7492C106.498 23.5917 104.394 23.4263 101.884 23.3507C101.148 23.316 98.9556 23.3018 98.6941 23.3018C92.96 23.3018 89.0753 24.6566 89.0753 29.896V30.6175C88.9088 32.1302 89.2985 33.6518 90.1716 34.8983C91.0447 36.1447 92.3416 37.0309 93.8201 37.3913C93.8201 37.3913 100.208 39.6802 100.414 39.7637C101.411 40.2583 101.769 40.8018 101.769 41.4965V41.9502C101.769 43.0781 100.868 43.5728 99.1084 43.5728H89.7164C89.6298 43.5743 89.5444 43.5934 89.4653 43.6287C89.3863 43.6641 89.3152 43.7151 89.2563 43.7787C89.1974 43.8423 89.1521 43.9171 89.1229 43.9986C89.0936 44.0802 89.0812 44.1668 89.0863 44.2533V47.9111C89.0863 48.1317 89.2234 48.5413 89.8566 48.5854C91.4036 48.7429 93.5082 48.9083 96.0176 48.9839C96.7533 49.0186 98.9461 49.0328 99.2076 49.0328C104.942 49.0328 108.826 47.678 108.826 42.4386V41.7171C108.993 40.2044 108.603 38.6828 107.73 37.4363C106.857 36.1898 105.56 35.3037 104.082 34.9433C104.082 34.9433 97.6938 32.6544 
97.4874 32.5709C96.4902 32.0762 96.1326 31.5328 96.1326 30.8381V30.3749C96.1326 29.247 97.0337 28.746 98.7933 28.746" fill="currentColor"></path><path d="M134.345 23.316C132.106 23.1876 129.868 23.5785 127.806 24.4581C125.75 23.5559 123.506 23.164 121.267 23.316C113.181 23.316 111.386 25.7987 111.386 34.132V48.4137C111.386 48.5808 111.453 48.7411 111.571 48.8592C111.689 48.9774 111.849 49.0438 112.016 49.0438H117.428C117.594 49.0413 117.753 48.9742 117.87 48.8565C117.988 48.7389 118.055 48.58 118.058 48.4137V35.8444C118.058 30.3781 118.573 29.3195 121.262 29.1194C123.765 29.3053 124.384 30.2394 124.46 34.7905V48.4042C124.466 48.5709 124.536 48.7289 124.655 48.846C124.773 48.9631 124.932 49.0304 125.099 49.0343H130.51C130.678 49.0343 130.838 48.9679 130.956 48.8498C131.074 48.7316 131.141 48.5713 131.141 48.4042V34.7921C131.216 30.2331 131.835 29.299 134.338 29.1131C137.027 29.3132 137.543 30.3733 137.543 35.8381V48.4074C137.543 48.5745 137.609 48.7348 137.727 48.8529C137.845 48.9711 138.006 49.0375 138.173 49.0375H143.584C143.75 49.0354 143.909 48.9684 144.027 48.8507C144.145 48.7329 144.212 48.5738 144.214 48.4074V34.1257C144.214 25.7924 142.461 23.3097 134.334 23.3097" fill="currentColor"></path></g><defs><clipPath id="clip0_2_90"><rect width="270.951" height="60" fill="white"></rect></clipPath></defs></svg></a></div></div></footer><div id="modal-portal"></div></div><script id="__NEXT_DATA__" type="application/json" nonce="69iEaJdwMppUPD7pEj6Czb10hCjhcz+NW5YfBAevhoo=">{"props":{"pageProps":{"article":{"id":382429,"postType":"Article","title":"RAGの検索性能を90%も低下させるテキストの落とし穴","slug":"ff2c528acf6b04","commentsCount":0,"likedCount":109,"bookmarkedCount":46,"bodyLettersCount":3417,"articleType":"tech","emoji":"📉","isSuspendingPrivate":false,"publishedAt":"2025-03-18T10:16:43.487+09:00","bodyUpdatedAt":"2025-03-17T19:13:06.197+09:00","sourceRepoUpdatedAt":null,"pinned":false,"path":"/knowledgesense/articles/ff2c528acf6b04","bodyHtml":"\u003ch2 id=\"%E5%B0%8E%E5%85%A5\" 
data-line=\"0\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#%E5%B0%8E%E5%85%A5\" aria-hidden=\"true\"\u003e\u003c/a\u003e 導入\u003c/h2\u003e\n\u003cp data-line=\"1\" class=\"code-line\"\u003eこんにちは、株式会社ナレッジセンスの須藤英寿です。\u003c/p\u003e\n\u003cp data-line=\"3\" class=\"code-line\"\u003e今回は、RAGの要であるEmbeddingの性能を大きく低下させてしまう、文章の特性について解説します。\u003cbr\u003e\nこのブログで紹介している内容は以下の論文を元に作成しておりますので、詳細はそちらをご確認ください。RAGを構成してみたが、どうしても正解の文章を取ってこれない!そんなときはもしかするとこの論文で紹介されているような文章になってしまっているかもしれません。\u003c/p\u003e\n\u003cp data-line=\"6\" class=\"code-line\"\u003e\u003cspan class=\"embed-block zenn-embedded zenn-embedded-card\"\u003e\u003ciframe id=\"zenn-embedded__8c27c04700ebe\" src=\"https://embed.zenn.studio/card#zenn-embedded__8c27c04700ebe\" data-content=\"https%3A%2F%2Farxiv.org%2Fpdf%2F2503.05037\" frameborder=\"0\" scrolling=\"no\" loading=\"lazy\"\u003e\u003c/iframe\u003e\u003c/span\u003e\u003ca href=\"https://arxiv.org/pdf/2503.05037\" style=\"display:none\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"\u003ehttps://arxiv.org/pdf/2503.05037\u003c/a\u003e\u003c/p\u003e\n\u003cp data-line=\"8\" class=\"code-line\"\u003e\u003cimg src=\"https://storage.googleapis.com/zenn-user-upload/10258576e44f-20250315.png\" loading=\"lazy\" class=\"md-img\"\u003e\u003c/p\u003e\n\u003ch2 id=\"%E3%82%B5%E3%83%9E%E3%83%AA%E3%83%BC\" data-line=\"10\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#%E3%82%B5%E3%83%9E%E3%83%AA%E3%83%BC\" aria-hidden=\"true\"\u003e\u003c/a\u003e サマリー\u003c/h2\u003e\n\u003cp data-line=\"12\" class=\"code-line\"\u003eEmbeddingは、RAGの検索能力の根幹に関わる機能ですが、その性能や特性についてはあまり知られてはいません。実は、保管するテキストの文体や分割方法次第で最大90%程度、検索性能が下がってしまいます。\u003c/p\u003e\n\u003cp data-line=\"14\" class=\"code-line\"\u003e今回紹介する論文では、Embeddingの性能を著しく下げるテキストの特徴を調べ、その性質についてまとめています。特に「文章の位置」、「使用する単語」、「文章量」による影響が大きいとされています。残念ながら具体的な対策は提示されていませんので、対応策の例として、これまでに紹介した記事をあわせて明記しておきました。そちらもぜひ、ご確認ください。\u003c/p\u003e\n\u003ch2 
id=\"%E5%95%8F%E9%A1%8C%E6%84%8F%E8%AD%98\" data-line=\"17\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#%E5%95%8F%E9%A1%8C%E6%84%8F%E8%AD%98\" aria-hidden=\"true\"\u003e\u003c/a\u003e 問題意識\u003c/h2\u003e\n\u003ch3 id=\"embedding%E3%81%AE%E7%89%B9%E5%BE%B4%E3%81%A8%E6%95%B0%E5%80%A4%E7%9A%84%E3%81%AA%E6%84%8F%E5%91%B3\" data-line=\"19\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#embedding%E3%81%AE%E7%89%B9%E5%BE%B4%E3%81%A8%E6%95%B0%E5%80%A4%E7%9A%84%E3%81%AA%E6%84%8F%E5%91%B3\" aria-hidden=\"true\"\u003e\u003c/a\u003e Embeddingの特徴と数値的な意味\u003c/h3\u003e\n\u003cp data-line=\"21\" class=\"code-line\"\u003eEmbeddingがどういったものかおさらいします。Embeddingは、主に数百文字程度の文章を数百次元以上のベクトルデータに置き換える機能を有しています。そして、このベクトルデータには以下のような特徴があります。\u003c/p\u003e\n\u003cul data-line=\"23\" class=\"code-line\"\u003e\n\u003cli data-line=\"23\" class=\"code-line\"\u003eベクトルデータは、文章の意味を表している\u003c/li\u003e\n\u003cli data-line=\"24\" class=\"code-line\"\u003eベクトルデータの内積が1に近いほど2つの文章の意味は類似している\u003c/li\u003e\n\u003c/ul\u003e\n\u003cp data-line=\"26\" class=\"code-line\"\u003e厳密には異なる特性を持つものもありますが、RAGについて考えるうえではこの理解で十分です。特に内積を取るだけで文章の類似度がわかるという性質のお陰で、EmbeddingはRAGの主要な検索機能としての役割を果たしています。\u003c/p\u003e\n\u003ch3 id=\"embedding%E3%81%AE%E9%99%90%E7%95%8C\" data-line=\"28\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#embedding%E3%81%AE%E9%99%90%E7%95%8C\" aria-hidden=\"true\"\u003e\u003c/a\u003e Embeddingの限界\u003c/h3\u003e\n\u003cp data-line=\"30\" class=\"code-line\"\u003eしかし、Embeddingにも限界があり一般的には「固有名詞に弱い」と言われています。これは、Embeddingの学習内で利用されてこなかった単語の意味をEmbeddingが理解できないためです。これは一つの例ですが、論文内では更に以下の要素によってEmbeddingの性能が下がると言及されています。\u003c/p\u003e\n\u003cul data-line=\"32\" class=\"code-line\"\u003e\n\u003cli data-line=\"32\" class=\"code-line\"\u003e位置バイアス: 重要な情報が文章内のどの位置にあるか\u003c/li\u003e\n\u003cli data-line=\"33\" class=\"code-line\"\u003e単語バイアス: 意味と関係なく同じ単語が含まれているか\u003c/li\u003e\n\u003cli data-line=\"34\" class=\"code-line\"\u003e文章量バイアス: 
文章量が多いか少ないか\u003c/li\u003e\n\u003c/ul\u003e\n\u003ch2 id=\"embedding%E3%81%AE%E6%80%A7%E8%83%BD%E3%82%92%E5%BC%95%E3%81%8D%E4%B8%8B%E3%81%92%E3%82%8B%E6%96%87%E7%AB%A0%E3%81%AE%E7%89%B9%E5%BE%B4\" data-line=\"36\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#embedding%E3%81%AE%E6%80%A7%E8%83%BD%E3%82%92%E5%BC%95%E3%81%8D%E4%B8%8B%E3%81%92%E3%82%8B%E6%96%87%E7%AB%A0%E3%81%AE%E7%89%B9%E5%BE%B4\" aria-hidden=\"true\"\u003e\u003c/a\u003e Embeddingの性能を引き下げる文章の特徴\u003c/h2\u003e\n\u003cp data-line=\"38\" class=\"code-line\"\u003eここでは、各種の文章の傾向によって生じる文章間の類似度の低下についてまとめています。比較に使用されている数値は、割合を表すものではなくt検定と呼ばれる比較手法で得られる値(t値)で、値が大きいほどずれが大きいことを示しています。\u003c/p\u003e\n\u003ch3 id=\"%E4%BD%8D%E7%BD%AE%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9\" data-line=\"40\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#%E4%BD%8D%E7%BD%AE%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9\" aria-hidden=\"true\"\u003e\u003c/a\u003e 位置バイアス\u003c/h3\u003e\n\u003cp data-line=\"42\" class=\"code-line\"\u003e\u003cimg src=\"https://storage.googleapis.com/zenn-user-upload/ca2f7e36fe8f-20250317.png\" loading=\"lazy\" class=\"md-img\"\u003e\u003c/p\u003e\n\u003cp data-line=\"44\" class=\"code-line\"\u003e入力したクエリに対する回答が、どの位置にあるかによって類似度がどのように変化するかを示したグラフです。文章の先頭にある状態が最も類似度の高い状態で、そこから減少していきモデルごとに最も性能が下がるポイントは異なりますが、誤差の範囲の7~20倍ものズレが発生してしまうことを示しています。\u003c/p\u003e\n\u003cp data-line=\"46\" class=\"code-line\"\u003e【対抗策】\u003c/p\u003e\n\u003cul data-line=\"47\" class=\"code-line\"\u003e\n\u003cli data-line=\"47\" class=\"code-line\"\u003e\u003ca href=\"https://zenn.dev/knowledgesense/articles/fe155b25510683\" target=\"_blank\"\u003eRAGで人間の脳を再現する\u003c/a\u003e\u003c/li\u003e\n\u003cli data-line=\"48\" class=\"code-line\"\u003e\u003ca href=\"https://zenn.dev/knowledgesense/articles/077ad1ab0f9ff6\" target=\"_blank\"\u003eRAGの「ベクトル検索」の弱みを、ナレッジグラフで補う\u003c/a\u003e\u003c/li\u003e\n\u003cli data-line=\"49\" class=\"code-line\"\u003e\u003ca 
href=\"https://zenn.dev/knowledgesense/articles/71e3d2d2ff3858\" target=\"_blank\"\u003eRAGのチャンク化を最適化する「RAPTOR」について\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003ch3 id=\"%E5%8D%98%E8%AA%9E%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9\" data-line=\"52\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#%E5%8D%98%E8%AA%9E%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9\" aria-hidden=\"true\"\u003e\u003c/a\u003e 単語バイアス\u003c/h3\u003e\n\u003cp data-line=\"54\" class=\"code-line\"\u003e\u003cimg src=\"https://storage.googleapis.com/zenn-user-upload/17e688ba21f5-20250317.png\" loading=\"lazy\" class=\"md-img\"\u003e\u003c/p\u003e\n\u003cp data-line=\"56\" class=\"code-line\"\u003e意味は同じだが、表記が異なる(例えば\"US\"と\"United States\")とドキュメントの類似度に誤差の範囲の20倍以上の差が生まれます。たとえば、「United States」を含むクエリで検索した場合、対象のドキュメントが「United States」なのか「US」なのかで、大きな差が生まれることになります。\u003c/p\u003e\n\u003cp data-line=\"58\" class=\"code-line\"\u003e【対抗策】\u003c/p\u003e\n\u003cul data-line=\"59\" class=\"code-line\"\u003e\n\u003cli data-line=\"59\" class=\"code-line\"\u003e\u003ca href=\"https://zenn.dev/knowledgesense/articles/fe155b25510683\" target=\"_blank\"\u003eRAGで人間の脳を再現する\u003c/a\u003e\u003c/li\u003e\n\u003cli data-line=\"60\" class=\"code-line\"\u003e\u003ca href=\"https://zenn.dev/knowledgesense/articles/077ad1ab0f9ff6\" target=\"_blank\"\u003eRAGの「ベクトル検索」の弱みを、ナレッジグラフで補う\u003c/a\u003e\u003c/li\u003e\n\u003cli data-line=\"61\" class=\"code-line\"\u003e\u003ca href=\"https://zenn.dev/knowledgesense/articles/abf34c417b079e\" target=\"_blank\"\u003eRAGの精度と速度を同時に向上「DIVA」による曖昧さ対策\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003ch3 id=\"%E6%96%87%E7%AB%A0%E9%87%8F%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9\" data-line=\"63\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#%E6%96%87%E7%AB%A0%E9%87%8F%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9\" aria-hidden=\"true\"\u003e\u003c/a\u003e 文章量バイアス\u003c/h3\u003e\n\u003cp data-line=\"64\" class=\"code-line\"\u003e\u003cimg 
src=\"https://storage.googleapis.com/zenn-user-upload/28b98b6f8610-20250317.png\" loading=\"lazy\" class=\"md-img\"\u003e\u003c/p\u003e\n\u003cp data-line=\"66\" class=\"code-line\"\u003e保管されている文章の長さによる類似度の比較結果です。この結果は正解となる文章だけの場合と、正解とは無関係な文章を含んだ場合で、類似度にどの程度差が生まれるかを示しています。\u003c/p\u003e\n\u003cp data-line=\"69\" class=\"code-line\"\u003e【対抗策】\u003c/p\u003e\n\u003cul data-line=\"70\" class=\"code-line\"\u003e\n\u003cli data-line=\"70\" class=\"code-line\"\u003e\u003ca href=\"https://zenn.dev/knowledgesense/articles/fe155b25510683\" target=\"_blank\"\u003eRAGで人間の脳を再現する\u003c/a\u003e\u003c/li\u003e\n\u003cli data-line=\"71\" class=\"code-line\"\u003e\u003ca href=\"https://zenn.dev/knowledgesense/articles/71e3d2d2ff3858\" target=\"_blank\"\u003eRAGのチャンク化を最適化する「RAPTOR」について\u003c/a\u003e\u003c/li\u003e\n\u003cli data-line=\"72\" class=\"code-line\"\u003e\u003ca href=\"https://zenn.dev/knowledgesense/articles/e0ade68c265200\" target=\"_blank\"\u003eRAGの「文脈が消える問題」を解決する「LongRAG」\u003c/a\u003e\u003c/li\u003e\n\u003c/ul\u003e\n\u003ch3 id=\"%E7%B2%BE%E5%BA%A6%E3%81%B8%E3%81%AE%E5%BD%B1%E9%9F%BF\" data-line=\"75\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#%E7%B2%BE%E5%BA%A6%E3%81%B8%E3%81%AE%E5%BD%B1%E9%9F%BF\" aria-hidden=\"true\"\u003e\u003c/a\u003e 精度への影響\u003c/h3\u003e\n\u003cp data-line=\"77\" class=\"code-line\"\u003e\u003cimg src=\"https://storage.googleapis.com/zenn-user-upload/6b51b8f595b4-20250317.png\" loading=\"lazy\" class=\"md-img\"\u003e\u003c/p\u003e\n\u003cp data-line=\"79\" class=\"code-line\"\u003e正解を含むが関係のない文章も含む正解ドキュメントと、正解を含まないがEmbeddingが一致していると導きたくなる特徴を持つ不正解ドキュメントを比較して、正解ドキュメントを類似していると判断できるかを実験しています。結果として、圧倒的に不正解ドキュメントを取得する傾向にあり、正解ドキュメントを選べる確率はほぼ0%となっています。\u003c/p\u003e\n\u003ch2 id=\"%E3%81%BE%E3%81%A8%E3%82%81\" data-line=\"81\" class=\"code-line\"\u003e\n\u003ca class=\"header-anchor-link\" href=\"#%E3%81%BE%E3%81%A8%E3%82%81\" aria-hidden=\"true\"\u003e\u003c/a\u003e まとめ\u003c/h2\u003e\n\u003cp data-line=\"83\" 
class=\"code-line\"\u003eドキュメントの特徴による、Embedding性能への影響についてまとめた論文を紹介しました。最後の「精度への影響」に記載されるような極端な例はそう多くは無いですが、これらのバイアスが反映されることで知らず知らずのうちにRAGの性能を引き下げることがあります。実際にRAGを運用していると、他にも、「意味的な区切りを無視して文章を分割してしまうことで意味が損なわれるケース」や、「パソコンの再起動」と「アプリの再起動」のように言葉としては似ているが、使われ方が全く違うような言葉が類似していると判定されてしまうといったケースがあげられます。汎用的に問題に対処するのは難しいので、自身のユースケースを見極めて、それにあった対処法を見つけることが重要です。対抗策に示された手法も、一つの対策になるのでぜひご活用ください。\u003c/p\u003e\n","ogImageUrl":"https://res.cloudinary.com/zenn/image/upload/s--1-ytxo17--/c_fit%2Cg_north_west%2Cl_text:notosansjp-medium.otf_55:RAG%25E3%2581%25AE%25E6%25A4%259C%25E7%25B4%25A2%25E6%2580%25A7%25E8%2583%25BD%25E3%2582%259290%25EF%25BC%2585%25E3%2582%2582%25E4%25BD%258E%25E4%25B8%258B%25E3%2581%2595%25E3%2581%259B%25E3%2582%258B%25E3%2583%2586%25E3%2582%25AD%25E3%2582%25B9%25E3%2583%2588%25E3%2581%25AE%25E8%2590%25BD%25E3%2581%25A8%25E3%2581%2597%25E7%25A9%25B4%2Cw_1010%2Cx_90%2Cy_100/g_south_west%2Cl_text:notosansjp-medium.otf_34:sasakuna%2Cx_220%2Cy_108/bo_3px_solid_rgb:d6e3ed%2Cg_south_west%2Ch_90%2Cl_fetch:aHR0cHM6Ly96ZW5uLmRldi9pbWFnZXMvZGVmYXVsdC1wdWJsaWNhdGlvbi1hdmF0YXIucG5n%2Cr_20%2Cw_90%2Cx_92%2Cy_102/co_rgb:6e7b85%2Cg_south_west%2Cl_text:notosansjp-medium.otf_30:%25E3%2583%258A%25E3%2583%25AC%25E3%2583%2583%25E3%2582%25B8%25E3%2582%25BB%25E3%2583%25B3%25E3%2582%25B9%2520-%2520AI%25E7%259F%25A5%25E8%25A6%258B%25E5%2585%25B1%25E6%259C%2589%25E3%2583%2596%25E3%2583%25AD%25E3%2582%25B0%2Cx_220%2Cy_160/bo_4px_solid_white%2Cg_south_west%2Ch_50%2Cl_fetch:aHR0cHM6Ly9zdG9yYWdlLmdvb2dsZWFwaXMuY29tL3plbm4tdXNlci11cGxvYWQvYXZhdGFyL2ZkMzU2MDA4YWUuanBlZw==%2Cr_max%2Cw_50%2Cx_139%2Cy_84/v1627283836/default/og-base-w1200-v2.png","toc":[{"id":"%E5%B0%8E%E5%85%A5","text":"導入","level":2,"children":[]},{"id":"%E3%82%B5%E3%83%9E%E3%83%AA%E3%83%BC","text":"サマリー","level":2,"children":[]},{"id":"%E5%95%8F%E9%A1%8C%E6%84%8F%E8%AD%98","text":"問題意識","level":2,"children":[{"id":"embedding%E3%81%AE%E7%89%B9%E5%BE%B4%E3%81%A8%E6%95%B0%E5%80%A4%E7%9A%84%E3%81%AA%E6%84%8F%E5%91%B3","text":"Embeddin
gの特徴と数値的な意味","level":3,"children":[]},{"id":"embedding%E3%81%AE%E9%99%90%E7%95%8C","text":"Embeddingの限界","level":3,"children":[]}]},{"id":"embedding%E3%81%AE%E6%80%A7%E8%83%BD%E3%82%92%E5%BC%95%E3%81%8D%E4%B8%8B%E3%81%92%E3%82%8B%E6%96%87%E7%AB%A0%E3%81%AE%E7%89%B9%E5%BE%B4","text":"Embeddingの性能を引き下げる文章の特徴","level":2,"children":[{"id":"%E4%BD%8D%E7%BD%AE%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9","text":"位置バイアス","level":3,"children":[]},{"id":"%E5%8D%98%E8%AA%9E%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9","text":"単語バイアス","level":3,"children":[]},{"id":"%E6%96%87%E7%AB%A0%E9%87%8F%E3%83%90%E3%82%A4%E3%82%A2%E3%82%B9","text":"文章量バイアス","level":3,"children":[]},{"id":"%E7%B2%BE%E5%BA%A6%E3%81%B8%E3%81%AE%E5%BD%B1%E9%9F%BF","text":"精度への影響","level":3,"children":[]}]},{"id":"%E3%81%BE%E3%81%A8%E3%82%81","text":"まとめ","level":2,"children":[]}],"tocEnabled":true,"shouldNoindex":false,"scheduledPublishAt":null,"canSendBadge":false,"status":"published","badges":[]},"user":{"id":157135,"username":"sasakuna","name":"sasakuna","avatarSmallUrl":"https://res.cloudinary.com/zenn/image/fetch/s--hkl_2ZEM--/c_limit%2Cf_auto%2Cfl_progressive%2Cq_auto%2Cw_70/https://storage.googleapis.com/zenn-user-upload/avatar/fd356008ae.jpeg","avatarUrl":"https://storage.googleapis.com/zenn-user-upload/avatar/fd356008ae.jpeg","bio":"Knowledge Sense, Inc. CTO","autolinkedBio":"Knowledge Sense, Inc. 
CTO","githubUsername":"","twitterUsername":"sasa_kuna_","isSupportOpen":false,"tokusyoContact":null,"tokusyoName":null,"websiteUrl":"","websiteDomain":null,"totalLikedCount":1150,"gaTrackingId":null,"hatenaId":null,"isInvoiceIssuer":false},"topics":[{"id":21335,"name":"embedding","taggingsCount":77,"imageUrl":"https://zenn.dev/images/topic.png","displayName":"embedding"},{"id":25950,"name":"rag","taggingsCount":672,"imageUrl":"https://zenn.dev/images/topic.png","displayName":"RAG"}],"isMine":false,"isPreview":false,"draftRevealScope":"private","githubRepository":null,"currentUserLiked":false,"currentUserBookmarked":false,"comments":[],"commentedUsers":[],"positiveCommentsCount":0,"publication":{"id":576,"name":"knowledgesense","displayName":"ナレッジセンス - AI知見共有ブログ","avatarSmallUrl":"https://zenn.dev/images/default-publication-avatar.png","avatarUrl":"https://zenn.dev/images/default-publication-avatar.png","pro":false,"avatarRegistered":false,"description":"株式会社ナレッジセンスは、「大企業の知的活動を最速にする」をミッションに掲げ、社内データ検索ができるAIチャットボットを開発・提供しているスタートアップです。このブログでは、LLMや検索技術、RAGの実装戦略などについて知見を共有していきます。","autolinkedDescription":"株式会社ナレッジセンスは、「大企業の知的活動を最速にする」をミッションに掲げ、社内データ検索ができるAIチャットボットを開発・提供しているスタートアップです。このブログでは、LLMや検索技術、RAGの実装戦略などについて知見を共有していきます。","twitterUsername":"","githubUsername":"","coverImageUrl":null,"fixedSentencesHtml":null,"isSupportOpen":true,"isArticleCommentOpen":true,"gaTrackingId":null}}},"page":"/[username]/articles/[slug]","query":{"username":"knowledgesense","slug":"ff2c528acf6b04"},"buildId":"K0KfDsWWmRmIshdNoQaRg","assetPrefix":"https://static.zenn.studio","isFallback":false,"isExperimentalCompile":false,"gip":true,"scriptLoader":[]}</script></body></html>

Pages: 1 2 3 4 5 6 7 8 9 10