<!-- CINXE.COM -->
<!doctype html><html lang="en"><head><title data-rh="true">Simple Chunking Strategies for RAG Applications (Part 1) | by kirouane Ayoub | Medium</title><meta data-rh="true" charset="utf-8"/><!-- Viewport: maximum-scale is intentionally omitted so users can pinch-zoom (WCAG 1.4.4 Resize Text). --><meta data-rh="true" name="viewport" content="width=device-width,minimum-scale=1,initial-scale=1"/><meta data-rh="true" name="theme-color" content="#000000"/><meta data-rh="true" name="twitter:app:name:iphone" content="Medium"/><meta data-rh="true" name="twitter:app:id:iphone" content="828256236"/><meta data-rh="true" property="al:ios:app_name" content="Medium"/><meta data-rh="true" property="al:ios:app_store_id" content="828256236"/><meta data-rh="true" property="al:android:package" content="com.medium.reader"/><meta data-rh="true" property="fb:app_id" content="542599432471018"/><meta data-rh="true" property="og:site_name" content="Medium"/><meta data-rh="true" property="og:type" content="article"/><meta data-rh="true" property="article:published_time" content="2024-08-27T10:23:58.071Z"/><meta data-rh="true" name="title" content="Simple Chunking Strategies for RAG Applications (Part 1) | by kirouane Ayoub | Medium"/><meta data-rh="true" property="og:title" content="Simple Chunking Strategies for RAG Applications (Part 1)"/><meta data-rh="true" property="al:android:url" content="medium://p/d56903b167c5"/><meta data-rh="true" property="al:ios:url" content="medium://p/d56903b167c5"/><meta data-rh="true" property="al:android:app_name" content="Medium"/><meta data-rh="true" name="description" content="When building a RAG (Retrieval-Augmented Generation) system, chunking text into manageable segments is a crucial step. Chunking not only ensures that content is well-organized but also improves the…"/><meta data-rh="true" property="og:description" content="When building a RAG (Retrieval-Augmented Generation) system, chunking text into manageable segments is a crucial step. 
Chunking not only…"/><meta data-rh="true" property="og:url" content="https://medium.com/@ayoubkirouane3/simple-chunking-strategies-for-rag-applications-part-1-d56903b167c5"/><meta data-rh="true" property="al:web:url" content="https://medium.com/@ayoubkirouane3/simple-chunking-strategies-for-rag-applications-part-1-d56903b167c5"/><meta data-rh="true" property="article:author" content="https://medium.com/@ayoubkirouane3"/><meta data-rh="true" name="author" content="kirouane Ayoub"/><meta data-rh="true" name="robots" content="index,noarchive,follow,max-image-preview:large"/><meta data-rh="true" name="referrer" content="unsafe-url"/><meta data-rh="true" property="twitter:title" content="Simple Chunking Strategies for RAG Applications (Part 1)"/><meta data-rh="true" name="twitter:site" content="@Medium"/><meta data-rh="true" name="twitter:app:url:iphone" content="medium://p/d56903b167c5"/><meta data-rh="true" property="twitter:description" content="When building a RAG (Retrieval-Augmented Generation) system, chunking text into manageable segments is a crucial step. 
Chunking not only…"/><meta data-rh="true" name="twitter:card" content="summary"/><meta data-rh="true" name="twitter:label1" content="Reading time"/><meta data-rh="true" name="twitter:data1" content="5 min read"/><link data-rh="true" rel="icon" href="https://miro.medium.com/v2/5d8de952517e8160e40ef9841c781cdc14a5db313057fa3c3de41c6f5b494b19"/><link data-rh="true" rel="search" type="application/opensearchdescription+xml" title="Medium" href="/osd.xml"/><link data-rh="true" rel="apple-touch-icon" sizes="152x152" href="https://miro.medium.com/v2/resize:fill:304:304/10fd5c419ac61637245384e7099e131627900034828f4f386bdaa47a74eae156"/><link data-rh="true" rel="apple-touch-icon" sizes="120x120" href="https://miro.medium.com/v2/resize:fill:240:240/10fd5c419ac61637245384e7099e131627900034828f4f386bdaa47a74eae156"/><link data-rh="true" rel="apple-touch-icon" sizes="76x76" href="https://miro.medium.com/v2/resize:fill:152:152/10fd5c419ac61637245384e7099e131627900034828f4f386bdaa47a74eae156"/><link data-rh="true" rel="apple-touch-icon" sizes="60x60" href="https://miro.medium.com/v2/resize:fill:120:120/10fd5c419ac61637245384e7099e131627900034828f4f386bdaa47a74eae156"/><link data-rh="true" rel="mask-icon" href="https://miro.medium.com/v2/resize:fill:1000:1000/7*GAOKVe--MXbEJmV9230oOQ.png" color="#171717"/><link data-rh="true" rel="preconnect" href="https://glyph.medium.com" crossOrigin=""/><link data-rh="true" id="glyph_preload_link" rel="preload" as="style" type="text/css" href="https://glyph.medium.com/css/unbound.css"/><link data-rh="true" id="glyph_link" rel="stylesheet" type="text/css" href="https://glyph.medium.com/css/unbound.css"/><link data-rh="true" rel="author" href="https://medium.com/@ayoubkirouane3"/><link data-rh="true" rel="canonical" href="https://medium.com/@ayoubkirouane3/simple-chunking-strategies-for-rag-applications-part-1-d56903b167c5"/><link data-rh="true" rel="alternate" href="android-app://com.medium.reader/https/medium.com/p/d56903b167c5"/><script 
data-rh="true" type="application/ld+json">{"@context":"http:\u002F\u002Fschema.org","@type":"NewsArticle","image":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fv2\u002Fda:true\u002Fbd978bb536350a710e8efb012513429cabdc4c28700604261aeda246d0f980b7","height":810,"width":1440},"url":"https:\u002F\u002Fmedium.com\u002F@ayoubkirouane3\u002Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5","dateCreated":"2024-08-27T10:23:58.071Z","datePublished":"2024-08-27T10:23:58.071Z","dateModified":"2024-11-17T16:50:03.528Z","headline":"Simple Chunking Strategies for RAG Applications (Part 1)","name":"Simple Chunking Strategies for RAG Applications (Part 1)","description":"When building a RAG (Retrieval-Augmented Generation) system, chunking text into manageable segments is a crucial step. Chunking not only ensures that content is well-organized but also improves the…","identifier":"d56903b167c5","author":{"@type":"Person","name":"kirouane Ayoub","url":"https:\u002F\u002Fmedium.com\u002F@ayoubkirouane3"},"creator":["kirouane Ayoub"],"publisher":{"@type":"Organization","name":"Medium","url":"https:\u002F\u002Fmedium.com\u002F","logo":{"@type":"ImageObject","width":272,"height":60,"url":"https:\u002F\u002Fmiro.medium.com\u002Fv2\u002Fresize:fit:544\u002F7*V1_7XP4snlmqrc_0Njontw.png"}},"mainEntityOfPage":"https:\u002F\u002Fmedium.com\u002F@ayoubkirouane3\u002Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5"}</script><style type="text/css" data-fela-rehydration="527" data-fela-type="STATIC">html{box-sizing:border-box;-webkit-text-size-adjust:100%}*, *:before, *:after{box-sizing:inherit}body{margin:0;padding:0;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;color:rgba(0,0,0,0.8);position:relative;min-height:100vh}h1, h2, h3, h4, h5, h6, dl, dd, ol, ul, menu, figure, blockquote, p, pre, form{margin:0}menu, ol, 
ul{padding:0;list-style:none;list-style-image:none}main{display:block}a{color:inherit;text-decoration:none}a, button, input{-webkit-tap-highlight-color:transparent}img, svg{vertical-align:middle}button{background:transparent;overflow:visible}button, input, optgroup, select, textarea{margin:0}:root{--reach-tabs:1;--reach-menu-button:1}#speechify-root{font-family:Sohne, sans-serif}div[data-popper-reference-hidden="true"]{visibility:hidden;pointer-events:none}.grecaptcha-badge{visibility:hidden} /*XCode style (c) Angel Garcia <angelgarcia.mail@gmail.com>*/.hljs {background: #fff;color: black; }/* Gray DOCTYPE selectors like WebKit */ .xml .hljs-meta {color: #c0c0c0; }.hljs-comment, .hljs-quote {color: #007400; }.hljs-tag, .hljs-attribute, .hljs-keyword, .hljs-selector-tag, .hljs-literal, .hljs-name {color: #aa0d91; }.hljs-variable, .hljs-template-variable {color: #3F6E74; }.hljs-code, .hljs-string, .hljs-meta .hljs-string {color: #c41a16; }.hljs-regexp, .hljs-link {color: #0E0EFF; }.hljs-title, .hljs-symbol, .hljs-bullet, .hljs-number {color: #1c00cf; }.hljs-section, .hljs-meta {color: #643820; }.hljs-title.class_, .hljs-class .hljs-title, .hljs-type, .hljs-built_in, .hljs-params {color: #5c2699; }.hljs-attr {color: #836C28; }.hljs-subst {color: #000; }.hljs-formula {background-color: #eee;font-style: italic; }.hljs-addition {background-color: #baeeba; }.hljs-deletion {background-color: #ffc8bd; }.hljs-selector-id, .hljs-selector-class {color: #9b703f; }.hljs-doctag, .hljs-strong {font-weight: bold; }.hljs-emphasis {font-style: italic; } </style><style type="text/css" data-fela-rehydration="527" data-fela-type="KEYFRAME">@-webkit-keyframes k1{0%{opacity:0.8}50%{opacity:0.5}100%{opacity:0.8}}@-moz-keyframes k1{0%{opacity:0.8}50%{opacity:0.5}100%{opacity:0.8}}@keyframes k1{0%{opacity:0.8}50%{opacity:0.5}100%{opacity:0.8}}</style><style type="text/css" data-fela-rehydration="527" data-fela-type="RULE">.a{font-family:medium-content-sans-serif-font, -apple-system, 
BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, Cantarell, "Open Sans", "Helvetica Neue", sans-serif}.b{font-weight:400}.c{background-color:rgba(255, 255, 255, 1)}.l{display:block}.m{position:sticky}.n{top:0}.o{z-index:500}.p{padding:0 24px}.q{align-items:center}.r{border-bottom:solid 1px #F2F2F2}.y{height:41px}.z{line-height:20px}.ab{display:flex}.ac{height:57px}.ae{flex:1 0 auto}.af{color:inherit}.ag{fill:inherit}.ah{font-size:inherit}.ai{border:inherit}.aj{font-family:inherit}.ak{letter-spacing:inherit}.al{font-weight:inherit}.am{padding:0}.an{margin:0}.ao{cursor:pointer}.ap:disabled{cursor:not-allowed}.aq:disabled{color:#6B6B6B}.ar:disabled{fill:#6B6B6B}.au{width:auto}.av path{fill:#242424}.aw{height:25px}.ax{margin-left:16px}.ay{border:none}.az{border-radius:20px}.ba{width:240px}.bb{background:#F9F9F9}.bc path{fill:#6B6B6B}.be{outline:none}.bf{font-family:sohne, "Helvetica Neue", Helvetica, Arial, sans-serif}.bg{font-size:14px}.bh{width:100%}.bi{padding:10px 20px 10px 0}.bj{background-color:transparent}.bk{color:#242424}.bl::placeholder{color:#6B6B6B}.bm{display:inline-block}.bn{margin-left:12px}.bo{margin-right:12px}.bp{border-radius:4px}.bq{margin-left:24px}.br{height:24px}.bx{background-color:#F9F9F9}.by{border-radius:50%}.bz{height:32px}.ca{width:32px}.cb{justify-content:center}.ch{max-width:680px}.ci{min-width:0}.cj{animation:k1 1.2s ease-in-out infinite}.ck{height:100vh}.cl{margin-bottom:16px}.cm{margin-top:48px}.cn{align-items:flex-start}.co{flex-direction:column}.cp{justify-content:space-between}.cq{margin-bottom:24px}.cw{width:80%}.cx{background-color:#F2F2F2}.dd{height:44px}.de{width:44px}.df{margin:auto 0}.dg{margin-bottom:4px}.dh{height:16px}.di{width:120px}.dj{width:80px}.dp{margin-bottom:8px}.dq{width:96%}.dr{width:98%}.ds{width:81%}.dt{margin-left:8px}.du{color:#6B6B6B}.dv{font-size:13px}.dw{height:100%}.ep{color:#FFFFFF}.eq{fill:#FFFFFF}.er{background:#1A8917}.es{border-color:#1A8917}.ew:disabled{cursor:inherit 
!important}.ex:disabled{opacity:0.3}.ey:disabled:hover{background:#1A8917}.ez:disabled:hover{border-color:#1A8917}.fa{border-radius:99em}.fb{border-width:1px}.fc{border-style:solid}.fd{box-sizing:border-box}.fe{text-decoration:none}.ff{text-align:center}.fi{margin-right:32px}.fj{position:relative}.fk{fill:#6B6B6B}.fn{background:transparent}.fo svg{margin-left:4px}.fp svg{fill:#6B6B6B}.fr{box-shadow:inset 0 0 0 1px rgba(0, 0, 0, 0.05)}.fs{position:absolute}.fz{margin:0 24px}.gd{background:rgba(255, 255, 255, 1)}.ge{border:1px solid #F2F2F2}.gf{box-shadow:0 1px 4px #F2F2F2}.gg{max-height:100vh}.gh{overflow-y:auto}.gi{left:0}.gj{top:calc(100vh + 100px)}.gk{bottom:calc(100vh + 100px)}.gl{width:10px}.gm{pointer-events:none}.gn{word-break:break-word}.go{word-wrap:break-word}.gp:after{display:block}.gq:after{content:""}.gr:after{clear:both}.gs{line-height:1.23}.gt{letter-spacing:0}.gu{font-style:normal}.gv{font-weight:700}.ia{align-items:baseline}.ib{width:48px}.ic{height:48px}.id{border:2px solid rgba(255, 255, 255, 1)}.ie{z-index:0}.if{box-shadow:none}.ig{border:1px solid rgba(0, 0, 0, 0.05)}.ih{margin-bottom:2px}.ii{flex-wrap:nowrap}.ij{font-size:16px}.ik{line-height:24px}.im{margin:0 8px}.in{display:inline}.io{color:#1A8917}.ip{fill:#1A8917}.is{flex:0 0 auto}.iv{flex-wrap:wrap}.iw{padding-left:8px}.ix{padding-right:8px}.jy> *{flex-shrink:0}.jz{overflow-x:scroll}.ka::-webkit-scrollbar{display:none}.kb{scrollbar-width:none}.kc{-ms-overflow-style:none}.kd{width:74px}.ke{flex-direction:row}.kf{z-index:2}.kg{margin-right:4px}.kj{-webkit-user-select:none}.kk{border:0}.kl{fill:rgba(117, 117, 117, 1)}.ko{outline:0}.kp{user-select:none}.kq> svg{pointer-events:none}.kz{cursor:progress}.la{opacity:1}.lb{padding:4px 0}.le{margin-top:0px}.lf{width:16px}.lh{display:inline-flex}.ln{max-width:100%}.lo{padding:8px 2px}.lp svg{color:#6B6B6B}.mg{line-height:1.58}.mh{letter-spacing:-0.004em}.mi{font-family:source-serif-pro, Georgia, Cambria, "Times New Roman", Times, 
serif}.nd{margin-bottom:-0.46em}.ne{line-height:1.18}.nf{letter-spacing:-0.022em}.ng{font-weight:600}.nw{margin-bottom:-0.31em}.oc{line-height:1.12}.ov{margin-bottom:-0.28em}.pb{overflow-x:auto}.pc{font-family:source-code-pro, Menlo, Monaco, "Courier New", Courier, monospace}.pd{padding:32px}.pe{border:1px solid #E5E5E5}.pf{line-height:1.4}.pg{margin-top:-0.2em}.ph{margin-bottom:-0.2em}.pi{white-space:pre}.pj{min-width:fit-content}.pk{margin-top:32px}.pl{margin-bottom:14px}.pm{padding-top:24px}.pn{padding-bottom:10px}.po{background-color:#000000}.pp{height:3px}.pq{width:3px}.pr{margin-right:20px}.ps{padding:2px 4px}.pt{font-size:75%}.pu> strong{font-family:inherit}.pv{text-decoration:underline}.pw{margin-bottom:26px}.px{margin-top:6px}.py{margin-top:8px}.pz{margin-right:8px}.qa{padding:8px 16px}.qb{border-radius:100px}.qc{transition:background 300ms ease}.qe{white-space:nowrap}.qf{border-top:none}.qg{height:52px}.qh{max-height:52px}.qi{box-sizing:content-box}.qj{position:static}.qk{z-index:1}.qm{max-width:155px}.qx{height:0px}.qy{margin-bottom:40px}.rn{height:64px}.ro{width:64px}.rp{align-self:flex-end}.rq{color:rgba(255, 255, 255, 1)}.rr{fill:rgba(255, 255, 255, 1)}.rs{background:rgba(25, 25, 25, 1)}.rt{border-color:rgba(25, 25, 25, 1)}.rw:disabled{opacity:0.1}.rx:disabled:hover{background:rgba(25, 25, 25, 1)}.ry:disabled:hover{border-color:rgba(25, 25, 25, 1)}.rz{flex:1 1 auto}.sf{padding-right:4px}.sg{font-weight:500}.sn{white-space:pre-wrap}.so{margin-top:16px}.sp{margin-bottom:54px}.sv{gap:18px}.sw{fill:rgba(61, 61, 61, 1)}.td{border-bottom:solid 1px #E5E5E5}.te{margin-top:72px}.tf{padding:24px 0}.tg{margin-bottom:0px}.th{margin-right:16px}.as:hover:not(:disabled){color:rgba(25, 25, 25, 1)}.at:hover:not(:disabled){fill:rgba(25, 25, 25, 1)}.et:hover{background:#156D12}.eu:hover{border-color:#156D12}.ev:hover{cursor:pointer}.fl:hover{color:#242424}.fm:hover{fill:#242424}.fq:hover svg{fill:#242424}.ft:hover{background-color:rgba(0, 0, 0, 
0.1)}.il:hover{text-decoration:underline}.iq:hover:not(:disabled){color:#156D12}.ir:hover:not(:disabled){fill:#156D12}.kn:hover{fill:rgba(8, 8, 8, 1)}.lc:hover{fill:#000000}.ld:hover p{color:#000000}.lg:hover{color:#000000}.lq:hover svg{color:#000000}.qd:hover{background-color:#F2F2F2}.rm:hover{background-color:none}.ru:hover{background:#000000}.rv:hover{border-color:#242424}.sx:hover{fill:rgba(25, 25, 25, 1)}.bd:focus-within path{fill:#242424}.km:focus{fill:rgba(8, 8, 8, 1)}.lr:focus svg{color:#000000}.kr:active{border-style:none}</style><style type="text/css" data-fela-rehydration="527" data-fela-type="RULE" media="all and (min-width: 1080px)">.d{display:none}.bw{width:64px}.cg{margin:0 64px}.cv{height:48px}.dc{margin-bottom:52px}.do{margin-bottom:48px}.ef{font-size:14px}.eg{line-height:20px}.em{font-size:13px}.eo{padding:5px 12px}.fh{display:flex}.fy{margin-bottom:68px}.gc{max-width:680px}.hq{font-size:42px}.hr{margin-top:1.19em}.hs{margin-bottom:32px}.ht{line-height:52px}.hu{letter-spacing:-0.011em}.hz{align-items:center}.jk{border-top:solid 1px #F2F2F2}.jl{border-bottom:solid 1px #F2F2F2}.jm{margin:32px 0 0}.jn{padding:3px 8px}.jw> *{margin-right:24px}.jx> :last-child{margin-right:0}.ky{margin-top:0px}.lm{margin:0}.mz{font-size:20px}.na{margin-top:2.14em}.nb{line-height:32px}.nc{letter-spacing:-0.003em}.nt{margin-top:1.72em}.nu{line-height:24px}.nv{letter-spacing:0}.ob{margin-top:0.94em}.or{font-size:24px}.os{margin-top:1.95em}.ot{line-height:30px}.ou{letter-spacing:-0.016em}.pa{margin-top:56px}.qr{display:inline-block}.qw{margin-bottom:104px}.qz{flex-direction:row}.rc{margin-bottom:0}.rd{margin-right:20px}.sa{max-width:500px}.su{margin-bottom:72px}.tc{padding-top:72px}</style><style type="text/css" data-fela-rehydration="527" data-fela-type="RULE" media="all and (max-width: 1079.98px)">.e{display:none}.kx{margin-top:0px}.qq{display:inline-block}</style><style type="text/css" data-fela-rehydration="527" data-fela-type="RULE" media="all and (max-width: 
903.98px)">.f{display:none}.kw{margin-top:0px}.qp{display:inline-block}</style><style type="text/css" data-fela-rehydration="527" data-fela-type="RULE" media="all and (max-width: 727.98px)">.g{display:none}.ku{margin-top:0px}.kv{margin-right:0px}.qo{display:inline-block}</style><style type="text/css" data-fela-rehydration="527" data-fela-type="RULE" media="all and (max-width: 551.98px)">.h{display:none}.s{display:flex}.t{justify-content:space-between}.bs{width:24px}.cc{margin:0 24px}.cr{height:40px}.cy{margin-bottom:44px}.dk{margin-bottom:32px}.dx{font-size:13px}.dy{line-height:20px}.eh{padding:0px 8px 1px}.fu{margin-bottom:4px}.gw{font-size:32px}.gx{margin-top:1.01em}.gy{margin-bottom:24px}.gz{line-height:38px}.ha{letter-spacing:-0.014em}.hv{align-items:flex-start}.it{flex-direction:column}.iy{margin:24px -24px 0}.iz{padding:0}.jo> *{margin-right:8px}.jp> :last-child{margin-right:24px}.kh{margin-left:0px}.ks{margin-top:0px}.kt{margin-right:0px}.li{margin:0}.ls{border:1px solid #F2F2F2}.lt{border-radius:99em}.lu{padding:0px 16px 0px 12px}.lv{height:38px}.lw{align-items:center}.ly svg{margin-right:8px}.mj{font-size:18px}.mk{margin-top:1.56em}.ml{line-height:28px}.mm{letter-spacing:-0.003em}.nh{font-size:16px}.ni{margin-top:1.23em}.nj{letter-spacing:0}.nx{margin-top:0.67em}.od{font-size:20px}.oe{margin-top:1.2em}.of{line-height:24px}.ow{margin-top:40px}.qn{display:inline-block}.qs{margin-bottom:96px}.rk{margin-bottom:20px}.rl{margin-right:0}.se{max-width:100%}.sh{font-size:24px}.si{line-height:30px}.sj{letter-spacing:-0.016em}.sq{margin-bottom:64px}.sy{padding-top:48px}.lx:hover{border-color:#E5E5E5}</style><style type="text/css" data-fela-rehydration="527" data-fela-type="RULE" media="all and (min-width: 904px) and (max-width: 1079.98px)">.i{display:none}.bv{width:64px}.cf{margin:0 64px}.cu{height:48px}.db{margin-bottom:52px}.dn{margin-bottom:48px}.ed{font-size:14px}.ee{line-height:20px}.ek{font-size:13px}.el{padding:5px 
12px}.fg{display:flex}.fx{margin-bottom:68px}.gb{max-width:680px}.hl{font-size:42px}.hm{margin-top:1.19em}.hn{margin-bottom:32px}.ho{line-height:52px}.hp{letter-spacing:-0.011em}.hy{align-items:center}.jg{border-top:solid 1px #F2F2F2}.jh{border-bottom:solid 1px #F2F2F2}.ji{margin:32px 0 0}.jj{padding:3px 8px}.ju> *{margin-right:24px}.jv> :last-child{margin-right:0}.ll{margin:0}.mv{font-size:20px}.mw{margin-top:2.14em}.mx{line-height:32px}.my{letter-spacing:-0.003em}.nq{margin-top:1.72em}.nr{line-height:24px}.ns{letter-spacing:0}.oa{margin-top:0.94em}.on{font-size:24px}.oo{margin-top:1.95em}.op{line-height:30px}.oq{letter-spacing:-0.016em}.oz{margin-top:56px}.qv{margin-bottom:104px}.ra{flex-direction:row}.re{margin-bottom:0}.rf{margin-right:20px}.sb{max-width:500px}.st{margin-bottom:72px}.tb{padding-top:72px}</style><style type="text/css" data-fela-rehydration="527" data-fela-type="RULE" media="all and (min-width: 728px) and (max-width: 903.98px)">.j{display:none}.w{display:flex}.x{justify-content:space-between}.bu{width:64px}.ce{margin:0 48px}.ct{height:48px}.da{margin-bottom:52px}.dm{margin-bottom:48px}.eb{font-size:13px}.ec{line-height:20px}.ej{padding:0px 8px 1px}.fw{margin-bottom:68px}.ga{max-width:680px}.hg{font-size:42px}.hh{margin-top:1.19em}.hi{margin-bottom:32px}.hj{line-height:52px}.hk{letter-spacing:-0.011em}.hx{align-items:center}.jc{border-top:solid 1px #F2F2F2}.jd{border-bottom:solid 1px #F2F2F2}.je{margin:32px 0 0}.jf{padding:3px 8px}.js> *{margin-right:24px}.jt> :last-child{margin-right:0}.lk{margin:0}.mr{font-size:20px}.ms{margin-top:2.14em}.mt{line-height:32px}.mu{letter-spacing:-0.003em}.nn{margin-top:1.72em}.no{line-height:24px}.np{letter-spacing:0}.nz{margin-top:0.94em}.oj{font-size:24px}.ok{margin-top:1.95em}.ol{line-height:30px}.om{letter-spacing:-0.016em}.oy{margin-top:56px}.qu{margin-bottom:104px}.rb{flex-direction:row}.rg{margin-bottom:0}.rh{margin-right:20px}.sc{max-width:500px}.ss{margin-bottom:72px}.ta{padding-top:72px}</style><style 
type="text/css" data-fela-rehydration="527" data-fela-type="RULE" media="all and (min-width: 552px) and (max-width: 727.98px)">.k{display:none}.u{display:flex}.v{justify-content:space-between}.bt{width:24px}.cd{margin:0 24px}.cs{height:40px}.cz{margin-bottom:44px}.dl{margin-bottom:32px}.dz{font-size:13px}.ea{line-height:20px}.ei{padding:0px 8px 1px}.fv{margin-bottom:4px}.hb{font-size:32px}.hc{margin-top:1.01em}.hd{margin-bottom:24px}.he{line-height:38px}.hf{letter-spacing:-0.014em}.hw{align-items:flex-start}.iu{flex-direction:column}.ja{margin:24px 0 0}.jb{padding:0}.jq> *{margin-right:8px}.jr> :last-child{margin-right:8px}.ki{margin-left:0px}.lj{margin:0}.lz{border:1px solid #F2F2F2}.ma{border-radius:99em}.mb{padding:0px 16px 0px 12px}.mc{height:38px}.md{align-items:center}.mf svg{margin-right:8px}.mn{font-size:18px}.mo{margin-top:1.56em}.mp{line-height:28px}.mq{letter-spacing:-0.003em}.nk{font-size:16px}.nl{margin-top:1.23em}.nm{letter-spacing:0}.ny{margin-top:0.67em}.og{font-size:20px}.oh{margin-top:1.2em}.oi{line-height:24px}.ox{margin-top:40px}.qt{margin-bottom:96px}.ri{margin-bottom:20px}.rj{margin-right:0}.sd{max-width:100%}.sk{font-size:24px}.sl{line-height:30px}.sm{letter-spacing:-0.016em}.sr{margin-bottom:64px}.sz{padding-top:48px}.me:hover{border-color:#E5E5E5}</style><style type="text/css" data-fela-rehydration="527" data-fela-type="RULE" media="print">.ql{display:none}</style></head><body><div id="root"><div class="a b c"><div class="d e f g h i j k"></div><script>document.domain = document.domain;</script><div class="l c"><div class="l m n o c"><div class="p q r s t u v w x i d y z"><a class="du ag dv bf ak b am an ao ap aq ar as at s u w i d q dw z" href="https://rsci.app.link/?%24canonical_url=https%3A%2F%2Fmedium.com%2Fp%2Fd56903b167c5&%7Efeature=LoOpenInAppButton&%7Echannel=ShowPostUnderUser&source=---top_nav_layout_nav----------------------------------" rel="noopener follow">Open in app<svg xmlns="http://www.w3.org/2000/svg" width="10" 
height="10" fill="none" viewBox="0 0 10 10" class="dt"><path fill="currentColor" d="M.985 8.485a.375.375 0 1 0 .53.53zM8.75 1.25h.375A.375.375 0 0 0 8.75.875zM8.375 6.5a.375.375 0 1 0 .75 0zM3.5.875a.375.375 0 1 0 0 .75zm-1.985 8.14 7.5-7.5-.53-.53-7.5 7.5zm6.86-7.765V6.5h.75V1.25zM3.5 1.625h5.25v-.75H3.5z"></path></svg></a><div class="ab q"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><button class="bf b dx dy eh dz ea ei eb ec ej ek ee el em eg eo ep eq er es et eu ev ew ex ey ez fa fb fc fd bm fe ff" data-testid="headerSignUpButton">Sign up</button></span></p><div class="ax l"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerSignInButton" rel="noopener follow" href="/m/signin?operation=login&redirect=https%3A%2F%2Fmedium.com%2F%40ayoubkirouane3%2Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5&source=post_page---top_nav_layout_nav-----------------------global_nav-----------">Sign in</a></span></p></div></div></div><div class="p q r ab ac"><div class="ab q ae"><a class="af ag ah ai aj ak al am an ao ap aq ar as at ab" aria-label="Homepage" data-testid="headerMediumLogo" rel="noopener follow" href="/?source=---top_nav_layout_nav----------------------------------"><svg xmlns="http://www.w3.org/2000/svg" width="719" height="160" fill="none" viewBox="0 0 719 160" class="au av aw"><path fill="#242424" d="m174.104 9.734.215-.047V8.02H130.39L89.6 103.89 48.81 8.021H1.472v1.666l.212.047c8.018 1.81 12.09 4.509 12.09 14.242V137.93c0 9.734-4.087 12.433-12.106 14.243l-.212.047v1.671h32.118v-1.665l-.213-.048c-8.018-1.809-12.089-4.509-12.089-14.242V30.586l52.399 123.305h2.972l53.925-126.743V140.75c-.687 7.688-4.721 10.062-11.982 11.701l-.215.05v1.652h55.948v-1.652l-.215-.05c-7.269-1.639-11.4-4.013-12.087-11.701l-.037-116.774h.037c0-9.733 4.071-12.432 12.087-14.242m25.555 75.488c.915-20.474 8.268-35.252 20.606-35.507 3.806.063 6.998 1.312 9.479 3.714 5.272 5.118 7.751 
15.812 7.368 31.793zm-.553 5.77h65.573v-.275c-.186-15.656-4.721-27.834-13.466-36.196-7.559-7.227-18.751-11.203-30.507-11.203h-.263c-6.101 0-13.584 1.48-18.909 4.16-6.061 2.807-11.407 7.003-15.855 12.511-7.161 8.874-11.499 20.866-12.554 34.343q-.05.606-.092 1.212a50 50 0 0 0-.065 1.151 85.807 85.807 0 0 0-.094 5.689c.71 30.524 17.198 54.917 46.483 54.917 25.705 0 40.675-18.791 44.407-44.013l-1.886-.664c-6.557 13.556-18.334 21.771-31.738 20.769-18.297-1.369-32.314-19.922-31.042-42.395m139.722 41.359c-2.151 5.101-6.639 7.908-12.653 7.908s-11.513-4.129-15.418-11.63c-4.197-8.053-6.405-19.436-6.405-32.92 0-28.067 8.729-46.22 22.24-46.22 5.657 0 10.111 2.807 12.236 7.704zm43.499 20.008c-8.019-1.897-12.089-4.722-12.089-14.951V1.309l-48.716 14.353v1.757l.299-.024c6.72-.543 11.278.386 13.925 2.83 2.072 1.915 3.082 4.853 3.082 8.987v18.66c-4.803-3.067-10.516-4.56-17.448-4.56-14.059 0-26.909 5.92-36.176 16.672-9.66 11.205-14.767 26.518-14.767 44.278-.003 31.72 15.612 53.039 38.851 53.039 13.595 0 24.533-7.449 29.54-20.013v16.865h43.711v-1.746zM424.1 19.819c0-9.904-7.468-17.374-17.375-17.374-9.859 0-17.573 7.632-17.573 17.374s7.721 17.374 17.573 17.374c9.907 0 17.375-7.47 17.375-17.374m11.499 132.546c-8.019-1.897-12.089-4.722-12.089-14.951h-.035V43.635l-43.714 12.551v1.705l.263.024c9.458.842 12.047 4.1 12.047 15.152v81.086h43.751v-1.746zm112.013 0c-8.018-1.897-12.089-4.722-12.089-14.951V43.635l-41.621 12.137v1.71l.246.026c7.733.813 9.967 4.257 9.967 15.36v59.279c-2.578 5.102-7.415 8.131-13.274 8.336-9.503 0-14.736-6.419-14.736-18.073V43.638l-43.714 12.55v1.703l.262.024c9.459.84 12.05 4.097 12.05 15.152v50.17a56.3 56.3 0 0 0 .91 10.444l.787 3.423c3.701 13.262 13.398 20.197 28.59 20.197 12.868 0 24.147-7.966 29.115-20.43v17.311h43.714v-1.747zm169.818 1.788v-1.749l-.213-.05c-8.7-2.006-12.089-5.789-12.089-13.49v-63.79c0-19.89-11.171-31.761-29.883-31.761-13.64 0-25.141 7.882-29.569 20.16-3.517-13.01-13.639-20.16-28.606-20.16-13.146 0-23.449 6.938-27.869 18.657V43.643L545.487 
55.68v1.715l.263.024c9.345.829 12.047 4.181 12.047 14.95v81.784h40.787v-1.746l-.215-.053c-6.941-1.631-9.181-4.606-9.181-12.239V66.998c1.836-4.289 5.537-9.37 12.853-9.37 9.086 0 13.692 6.296 13.692 18.697v77.828h40.797v-1.746l-.215-.053c-6.94-1.631-9.18-4.606-9.18-12.239V75.066a42 42 0 0 0-.578-7.26c1.947-4.661 5.86-10.177 13.475-10.177 9.214 0 13.691 6.114 13.691 18.696v77.828z"></path></svg></a><div class="ax h"><div class="ab ay az ba bb q bc bd"><div class="bm" aria-hidden="false" aria-describedby="searchResults" aria-labelledby="searchResults"></div><div class="bn bo ab"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path fill="currentColor" fill-rule="evenodd" d="M4.092 11.06a6.95 6.95 0 1 1 13.9 0 6.95 6.95 0 0 1-13.9 0m6.95-8.05a8.05 8.05 0 1 0 5.13 14.26l3.75 3.75a.56.56 0 1 0 .79-.79l-3.73-3.73A8.05 8.05 0 0 0 11.042 3z" clip-rule="evenodd"></path></svg></div><input role="combobox" aria-controls="searchResults" aria-expanded="false" aria-label="search" data-testid="headerSearchInput" tabindex="0" class="ay be bf bg z bh bi bj bk bl" placeholder="Search" value=""/></div></div></div><div class="h k w fg fh"><div class="fi ab"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerWriteButton" rel="noopener follow" href="/m/signin?operation=register&redirect=https%3A%2F%2Fmedium.com%2Fnew-story&source=---top_nav_layout_nav-----------------------new_post_topnav-----------"><div class="bf b bg z du fj fk ab q fl fm"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24" aria-label="Write"><path fill="currentColor" d="M14 4a.5.5 0 0 0 0-1zm7 6a.5.5 0 0 0-1 0zm-7-7H4v1h10zM3 4v16h1V4zm1 17h16v-1H4zm17-1V10h-1v10zm-1 1a1 1 0 0 0 1-1h-1zM3 20a1 1 0 0 0 1 1v-1zM4 3a1 1 0 0 0-1 1h1z"></path><path stroke="currentColor" d="m17.5 4.5-8.458 8.458a.25.25 0 0 0-.06.098l-.824 2.47a.25.25 0 0 0 .316.316l2.47-.823a.25.25 0 0 0 .098-.06L19.5 6.5m-2-2 
2.323-2.323a.25.25 0 0 1 .354 0l1.646 1.646a.25.25 0 0 1 0 .354L19.5 6.5m-2-2 2 2"></path></svg><div class="dt l">Write</div></div></a></span></div></div><div class="k j i d"><div class="fi ab"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerSearchButton" rel="noopener follow" href="/search?source=---top_nav_layout_nav----------------------------------"><div class="bf b bg z du fj fk ab q fl fm"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24" aria-label="Search"><path fill="currentColor" fill-rule="evenodd" d="M4.092 11.06a6.95 6.95 0 1 1 13.9 0 6.95 6.95 0 0 1-13.9 0m6.95-8.05a8.05 8.05 0 1 0 5.13 14.26l3.75 3.75a.56.56 0 1 0 .79-.79l-3.73-3.73A8.05 8.05 0 0 0 11.042 3z" clip-rule="evenodd"></path></svg></div></a></div></div><div class="fi h k j"><div class="ab q"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><button class="bf b dx dy eh dz ea ei eb ec ej ek ee el em eg eo ep eq er es et eu ev ew ex ey ez fa fb fc fd bm fe ff" data-testid="headerSignUpButton">Sign up</button></span></p><div class="ax l"><p class="bf b dx dy dz ea eb ec ed ee ef eg du"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerSignInButton" rel="noopener follow" href="/m/signin?operation=login&redirect=https%3A%2F%2Fmedium.com%2F%40ayoubkirouane3%2Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5&source=post_page---top_nav_layout_nav-----------------------global_nav-----------">Sign in</a></span></p></div></div></div><div class="l" aria-hidden="false"><button class="ay fn am ab q ao fo fp fq" aria-label="user options menu" data-testid="headerUserIcon"><div class="l fj"><img alt="" class="l fd by bz ca cx" src="https://miro.medium.com/v2/resize:fill:64:64/1*dmbNkD5D-u45r44go_cf0g.png" width="32" height="32" loading="lazy" role="presentation"/><div class="fr by l bz ca fs n ay ft"></div></div></button></div></div></div><div class="l"><div class="fu fv fw fx fy 
l"><div class="ab cb"><div class="ci bh fz ga gb gc"></div></div><article><div class="l"><div class="l"><span class="l"></span><section><div><div class="fs gi gj gk gl gm"></div><div class="gn go gp gq gr"><div class="ab cb"><div class="ci bh fz ga gb gc"><div><h1 id="fe01" class="pw-post-title gs gt gu bf gv gw gx gy gz ha hb hc hd he hf hg hh hi hj hk hl hm hn ho hp hq hr hs ht hu bk" data-testid="storyTitle">Simple Chunking Strategies for RAG Applications (Part 1)</h1><div><div class="speechify-ignore ab cp"><div class="speechify-ignore bh l"><div class="hv hw hx hy hz ab"><div><div class="ab ia"><div><div class="bm" aria-hidden="false"><a rel="noopener follow" href="/@ayoubkirouane3?source=post_page---byline--d56903b167c5--------------------------------"><div class="l ib ic by id ie"><div class="l fj"><img alt="kirouane Ayoub" class="l fd by dd de cx" src="https://miro.medium.com/v2/resize:fill:88:88/1*T-KWhmfASlLM3XMvRKZnWA.jpeg" width="44" height="44" loading="lazy" data-testid="authorPhoto"/><div class="if by l dd de fs n ig ft"></div></div></div></a></div></div></div></div><div class="bn bh l"><div class="ab"><div style="flex:1"><span class="bf b bg z bk"><div class="ih ab q"><div class="ab q ii"><div class="ab q"><div><div class="bm" aria-hidden="false"><p class="bf b ij ik bk"><a class="af ag ah ai aj ak al am an ao ap aq ar il" data-testid="authorName" rel="noopener follow" href="/@ayoubkirouane3?source=post_page---byline--d56903b167c5--------------------------------">kirouane Ayoub</a></p></div></div></div><span class="im in" aria-hidden="true"><span class="bf b bg z du">·</span></span><p class="bf b ij ik du"><span><a class="io ip ah ai aj ak al am an ao ap aq ar ex iq ir" rel="noopener follow" 
href="/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fsubscribe%2Fuser%2F4751fd7878c5&operation=register&redirect=https%3A%2F%2Fmedium.com%2F%40ayoubkirouane3%2Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5&user=kirouane+Ayoub&userId=4751fd7878c5&source=post_page-4751fd7878c5--byline--d56903b167c5---------------------post_header-----------">Follow</a></span></p></div></div></span></div></div><div class="l is"><span class="bf b bg z du"><div class="ab cn it iu iv"><span class="bf b bg z du"><div class="ab ae"><span data-testid="storyReadTime">5 min read</span><div class="iw ix l" aria-hidden="true"><span class="l" aria-hidden="true"><span class="bf b bg z du">·</span></span></div><span data-testid="storyPublishDate">Aug 27, 2024</span></div></span></div></span></div></div></div><div class="ab cp iy iz ja jb jc jd je jf jg jh ji jj jk jl jm jn"><div class="h k w fg fh q"><div class="kd l"><div class="ab q ke kf"><div class="pw-multi-vote-icon fj kg kh ki kj"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerClapButton" rel="noopener follow" href="/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fvote%2Fp%2Fd56903b167c5&operation=register&redirect=https%3A%2F%2Fmedium.com%2F%40ayoubkirouane3%2Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5&user=kirouane+Ayoub&userId=4751fd7878c5&source=---header_actions--d56903b167c5---------------------clap_footer-----------"><div><div class="bm" aria-hidden="false"><div class="kk ao kl km kn ko am kp kq kr kj"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" aria-label="clap"><path fill-rule="evenodd" d="M11.37.828 12 3.282l.63-2.454zM13.916 3.953l1.523-2.112-1.184-.39zM8.589 1.84l1.522 2.112-.337-2.501zM18.523 18.92c-.86.86-1.75 1.246-2.62 1.33a6 6 0 0 0 .407-.372c2.388-2.389 2.86-4.951 1.399-7.623l-.912-1.603-.79-1.672c-.26-.56-.194-.98.203-1.288a.7.7 0 0 1 .546-.132c.283.046.546.231.728.5l2.363 4.157c.976 1.624 1.141 
4.237-1.324 6.702m-10.999-.438L3.37 14.328a.828.828 0 0 1 .585-1.408.83.83 0 0 1 .585.242l2.158 2.157a.365.365 0 0 0 .516-.516l-2.157-2.158-1.449-1.449a.826.826 0 0 1 1.167-1.17l3.438 3.44a.363.363 0 0 0 .516 0 .364.364 0 0 0 0-.516L5.293 9.513l-.97-.97a.826.826 0 0 1 0-1.166.84.84 0 0 1 1.167 0l.97.968 3.437 3.436a.36.36 0 0 0 .517 0 .366.366 0 0 0 0-.516L6.977 7.83a.82.82 0 0 1-.241-.584.82.82 0 0 1 .824-.826c.219 0 .43.087.584.242l5.787 5.787a.366.366 0 0 0 .587-.415l-1.117-2.363c-.26-.56-.194-.98.204-1.289a.7.7 0 0 1 .546-.132c.283.046.545.232.727.501l2.193 3.86c1.302 2.38.883 4.59-1.277 6.75-1.156 1.156-2.602 1.627-4.19 1.367-1.418-.236-2.866-1.033-4.079-2.246M10.75 5.971l2.12 2.12c-.41.502-.465 1.17-.128 1.89l.22.465-3.523-3.523a.8.8 0 0 1-.097-.368c0-.22.086-.428.241-.584a.847.847 0 0 1 1.167 0m7.355 1.705c-.31-.461-.746-.758-1.23-.837a1.44 1.44 0 0 0-1.11.275c-.312.24-.505.543-.59.881a1.74 1.74 0 0 0-.906-.465 1.47 1.47 0 0 0-.82.106l-2.182-2.182a1.56 1.56 0 0 0-2.2 0 1.54 1.54 0 0 0-.396.701 1.56 1.56 0 0 0-2.21-.01 1.55 1.55 0 0 0-.416.753c-.624-.624-1.649-.624-2.237-.037a1.557 1.557 0 0 0 0 2.2c-.239.1-.501.238-.715.453a1.56 1.56 0 0 0 0 2.2l.516.515a1.556 1.556 0 0 0-.753 2.615L7.01 19c1.32 1.319 2.909 2.189 4.475 2.449q.482.08.971.08c.85 0 1.653-.198 2.393-.579.231.033.46.054.686.054 1.266 0 2.457-.52 3.505-1.567 2.763-2.763 2.552-5.734 1.439-7.586z" clip-rule="evenodd"></path></svg></div></div></div></a></span></div><div class="pw-multi-vote-count l ks kt ku kv kw kx ky"><p class="bf b dv z du"><span class="kz">--</span></p></div></div></div><div><div class="bm" aria-hidden="false"><button class="ao kk la lb ab q fk lc ld" aria-label="responses"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" class="le"><path d="M18.006 16.803c1.533-1.456 2.234-3.325 2.234-5.321C20.24 7.357 16.709 4 12.191 4S4 7.357 4 11.482c0 4.126 3.674 7.482 8.191 7.482.817 0 1.622-.111 2.393-.327.231.2.48.391.744.559 1.06.693 2.203 1.044 3.399 
1.044.224-.008.4-.112.486-.287a.49.49 0 0 0-.042-.518c-.495-.67-.845-1.364-1.04-2.057a4 4 0 0 1-.125-.598zm-3.122 1.055-.067-.223-.315.096a8 8 0 0 1-2.311.338c-4.023 0-7.292-2.955-7.292-6.587 0-3.633 3.269-6.588 7.292-6.588 4.014 0 7.112 2.958 7.112 6.593 0 1.794-.608 3.469-2.027 4.72l-.195.168v.255c0 .056 0 .151.016.295.025.231.081.478.154.733.154.558.398 1.117.722 1.659a5.3 5.3 0 0 1-2.165-.845c-.276-.176-.714-.383-.941-.59z"></path></svg></button></div></div></div><div class="ab q jo jp jq jr js jt ju jv jw jx jy jz ka kb kc"><div class="lf k j i d"></div><div class="h k"><div><div class="bm" aria-hidden="false"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="headerBookmarkButton" rel="noopener follow" href="/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fbookmark%2Fp%2Fd56903b167c5&operation=register&redirect=https%3A%2F%2Fmedium.com%2F%40ayoubkirouane3%2Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5&source=---header_actions--d56903b167c5---------------------bookmark_footer-----------"><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25" class="du lg" aria-label="Add to list bookmark button"><path fill="currentColor" d="M18 2.5a.5.5 0 0 1 1 0V5h2.5a.5.5 0 0 1 0 1H19v2.5a.5.5 0 1 1-1 0V6h-2.5a.5.5 0 0 1 0-1H18zM7 7a1 1 0 0 1 1-1h3.5a.5.5 0 0 0 0-1H8a2 2 0 0 0-2 2v14a.5.5 0 0 0 .805.396L12.5 17l5.695 4.396A.5.5 0 0 0 19 21v-8.5a.5.5 0 0 0-1 0v7.485l-5.195-4.012a.5.5 0 0 0-.61 0L7 19.985z"></path></svg></a></span></div></div></div><div class="fd lh cn"><div class="l ae"><div class="ab cb"><div class="li lj lk ll lm ln ci bh"><div class="ab"><div class="bm bh" aria-hidden="false"><div><div class="bm" aria-hidden="false"><button aria-label="Listen" data-testid="audioPlayButton" class="af fk ah ai aj ak al lo an ao ap ex lp lq ld lr ls lt lu lv s lw lx ly lz ma mb mc u md me mf"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 
24"><path fill="currentColor" fill-rule="evenodd" d="M3 12a9 9 0 1 1 18 0 9 9 0 0 1-18 0m9-10C6.477 2 2 6.477 2 12s4.477 10 10 10 10-4.477 10-10S17.523 2 12 2m3.376 10.416-4.599 3.066a.5.5 0 0 1-.777-.416V8.934a.5.5 0 0 1 .777-.416l4.599 3.066a.5.5 0 0 1 0 .832" clip-rule="evenodd"></path></svg><div class="j i d"><p class="bf b bg z du">Listen</p></div></button></div></div></div></div></div></div></div></div><div class="bm" aria-hidden="false" aria-describedby="postFooterSocialMenu" aria-labelledby="postFooterSocialMenu"><div><div class="bm" aria-hidden="false"><button aria-controls="postFooterSocialMenu" aria-expanded="false" aria-label="Share Post" data-testid="headerSocialShareButton" class="af fk ah ai aj ak al lo an ao ap ex lp lq ld lr ls lt lu lv s lw lx ly lz ma mb mc u md me mf"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path fill="currentColor" fill-rule="evenodd" d="M15.218 4.931a.4.4 0 0 1-.118.132l.012.006a.45.45 0 0 1-.292.074.5.5 0 0 1-.3-.13l-2.02-2.02v7.07c0 .28-.23.5-.5.5s-.5-.22-.5-.5v-7.04l-2 2a.45.45 0 0 1-.57.04h-.02a.4.4 0 0 1-.16-.3.4.4 0 0 1 .1-.32l2.8-2.8a.5.5 0 0 1 .7 0l2.8 2.79a.42.42 0 0 1 .068.498m-.106.138.008.004v-.01zM16 7.063h1.5a2 2 0 0 1 2 2v10a2 2 0 0 1-2 2h-11c-1.1 0-2-.9-2-2v-10a2 2 0 0 1 2-2H8a.5.5 0 0 1 .35.15.5.5 0 0 1 .15.35.5.5 0 0 1-.15.35.5.5 0 0 1-.35.15H6.4c-.5 0-.9.4-.9.9v10.2a.9.9 0 0 0 .9.9h11.2c.5 0 .9-.4.9-.9v-10.2c0-.5-.4-.9-.9-.9H16a.5.5 0 0 1 0-1" clip-rule="evenodd"></path></svg><div class="j i d"><p class="bf b bg z du">Share</p></div></button></div></div></div></div></div></div></div></div></div><p id="82c9" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk">When building a RAG (Retrieval-Augmented Generation) system, <strong class="mi gv">chunking</strong> text into manageable segments is a crucial step. 
Chunking not only ensures that content is well-organized but also improves the relevance and efficiency of search results. While many chunking techniques exist, this post will focus on basic strategies implemented using Langchain and Llama-Index. This is the first part of a series where we will explore these strategies.</p><h2 id="4a41" class="ne nf gu bf ng nh ni dy nj nk nl ea nm mr nn no np mv nq nr ns mz nt nu nv nw bk">Chunking Considerations</h2><p id="14a4" class="pw-post-body-paragraph mg mh gu mi b mj nx ml mm mn ny mp mq mr nz mt mu mv oa mx my mz ob nb nc nd gn bk">Before diving into the methods, it’s essential to consider:</p><p id="e1a5" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Chunk Size:</strong> The size of each chunk should strike a balance between maintaining enough context for meaningful analysis and avoiding excessively large chunks that could dilute their focus. Smaller chunks (e.g., 256 to 512 tokens) are suited for detailed, granular tasks, whereas larger chunks may be better for understanding broader themes.</p><p id="185a" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Chunk Overlap:</strong> An overlap of 100–200 tokens is generally effective. This overlap helps maintain continuity and context between chunks, ensuring that segmentation does not disrupt the flow and coherence of the text.</p><p id="03b8" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Model Compatibility:</strong> The chunk size should align with the processing capabilities of the underlying language models. Some models handle larger chunks effectively, while others might be optimized for shorter chunks, suitable for sentence-level embeddings. 
Ensure that your chunk size is compatible with the model’s requirements to optimize performance.</p><p id="c11a" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Task Specificity:</strong> The nature of your task significantly impacts the optimal chunking strategy. For tasks involving precise information retrieval, smaller, more focused chunks can enhance retrieval accuracy. Conversely, tasks requiring complex reasoning or broader context might benefit from larger chunks that capture more comprehensive information.</p><p id="a7b6" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">System Constraints:</strong> If the chunked content needs to be processed by another system with token limitations or other constraints, you must adjust chunk sizes to fit within those boundaries. Ensure that the chunks do not exceed the maximum token limits of any integrated systems or APIs to avoid processing issues.</p><h2 id="14d9" class="ne nf gu bf ng nh ni dy nj nk nl ea nm mr nn no np mv nq nr ns mz nt nu nv nw bk">The Problem We Want to Solve</h2><p id="a8fe" class="pw-post-body-paragraph mg mh gu mi b mj nx ml mm mn ny mp mq mr nz mt mu mv oa mx my mz ob nb nc nd gn bk">In a RAG application, retrieving relevant information from a vast amount of data efficiently is paramount. Incorrect chunking can result in either losing important context or including too much noise, leading to poor search results. 
The goal is to find a chunking strategy that balances precision with context retention, optimizing both the embedding process and retrieval quality.</p><h1 id="02da" class="oc nf gu bf ng od oe of nj og oh oi nm oj ok ol om on oo op oq or os ot ou ov bk">Simple chunking methods</h1><h2 id="4faf" class="ne nf gu bf ng nh ni dy nj nk nl ea nm mr nn no np mv nq nr ns mz nt nu nv nw bk">1 / Character Splitting</h2><p id="623d" class="pw-post-body-paragraph mg mh gu mi b mj nx ml mm mn ny mp mq mr nz mt mu mv oa mx my mz ob nb nc nd gn bk">This method involves splitting text at fixed character intervals, possibly with some overlap, to ensure context is maintained across chunks. This is a simple yet effective approach for uniformly structured content.</p><p id="2a9a" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Using Langchain</strong></p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="0634" class="pf nf gu pc b bg pg ph l pi pj">text = """<br/>Character splitting is the most basic form of splitting up your text.<br/>It is the process of simply dividing your text into N-character sized chunks regardless of their content or form.<br/>"""<br/><br/>from langchain.text_splitter import CharacterTextSplitter<br/>text_splitter = CharacterTextSplitter(chunk_size = 35, chunk_overlap=10, separator=' ', strip_whitespace=False)<br/>documents = text_splitter.create_documents([text])<br/>print(documents)</span></pre><h2 id="9c01" class="ne nf gu bf ng nh ni dy nj nk nl ea nm mr nn no np mv nq nr ns mz nt nu nv nw bk">2 / Recursive Character Text Splitting</h2><p id="3fbc" class="pw-post-body-paragraph mg mh gu mi b mj nx ml mm mn ny mp mq mr nz mt mu mv oa mx my mz ob nb nc nd gn bk">Recursive chunking breaks down text hierarchically, using different separators, to create contextually relevant chunks.</p><p id="3c84" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq 
mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Using Langchain</strong></p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="3cc3" class="pf nf gu pc b bg pg ph l pi pj">text = """<br/>This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ['\n\n', '\n', ' ', '']. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.<br/>"""<br/><br/>from langchain.text_splitter import RecursiveCharacterTextSplitter<br/><br/>text_splitter = RecursiveCharacterTextSplitter(chunk_size = 450,<br/> chunk_overlap=50)<br/>documents = text_splitter.create_documents([text])<br/>print(documents)</span></pre><p id="199e" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Using Llama-index</strong></p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="50db" class="pf nf gu pc b bg pg ph l pi pj">from langchain.text_splitter import RecursiveCharacterTextSplitter<br/>from llama_index.core.node_parser import LangchainNodeParser<br/>from llama_index.core import SimpleDirectoryReader<br/><br/>reader = SimpleDirectoryReader("data")<br/>documents = reader.load_data()<br/><br/>parser = LangchainNodeParser(RecursiveCharacterTextSplitter())<br/>nodes = parser.get_nodes_from_documents(documents)<br/>print(nodes)</span></pre><h2 id="940d" class="ne nf gu bf ng nh ni dy nj nk nl ea nm mr nn no np mv nq nr ns mz nt nu nv nw bk">3 / Sentence Splitting</h2><p id="5e3e" class="pw-post-body-paragraph mg mh gu mi b mj nx ml mm mn ny mp mq mr nz mt mu mv oa mx my mz ob nb nc nd gn bk">A Sentence Splitter breaks text into individual sentences, facilitating more precise text analysis and processing.</p><p 
id="7792" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Using Llama-index</strong></p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="3ab1" class="pf nf gu pc b bg pg ph l pi pj"># text = """<br/># This tool enhances tasks like information retrieval and text generation by treating each sentence as a distinct unit, ensuring context is maintained and understood correctly.<br/># """<br/># with open('data/text.txt', 'w') as f:<br/># f.write(text)<br/><br/>from llama_index.core.node_parser import SentenceSplitter<br/>from llama_index.core import SimpleDirectoryReader<br/><br/>reader = SimpleDirectoryReader("data")<br/>documents = reader.load_data()<br/><br/>splitter = SentenceSplitter(<br/> chunk_size=1024,<br/> chunk_overlap=20,<br/>)<br/>nodes = splitter.get_nodes_from_documents(documents)<br/>print(nodes)</span></pre></div></div></div><div class="ab cb pk pl pm pn" role="separator"><span class="po by bm pp pq pr"></span><span class="po by bm pp pq pr"></span><span class="po by bm pp pq"></span></div><div class="gn go gp gq gr"><div class="ab cb"><div class="ci bh fz ga gb gc"><h2 id="db1a" class="ne nf gu bf ng nh ni dy nj nk nl ea nm mr nn no np mv nq nr ns mz nt nu nv nw bk">4 / Document Specific Splitting</h2><p id="668a" class="pw-post-body-paragraph mg mh gu mi b mj nx ml mm mn ny mp mq mr nz mt mu mv oa mx my mz ob nb nc nd gn bk">A “structure-aware” chunker divides text based on its inherent structure, such as headings, lists, or sections, to preserve the content’s logical organization. 
This method ensures that chunks retain their meaningful context and coherence, which is particularly useful for structured documents like reports or manuals.</p><p id="29e8" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Using Langchain</strong></p><p id="011f" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk">MarkdownTextSplitter :</p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="683a" class="pf nf gu pc b bg pg ph l pi pj">with open("README.md") as f:<br/> markdown_text = f.read()<br/><br/>from langchain.text_splitter import MarkdownTextSplitter<br/><br/>splitter = MarkdownTextSplitter(chunk_size = 40, <br/> chunk_overlap=0)<br/><br/>documents = splitter.create_documents([markdown_text])<br/><br/>print(documents)</span></pre><p id="a0a6" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk">Python code Splitter :</p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="f66c" class="pf nf gu pc b bg pg ph l pi pj">python_text = """<br/>import numpy as np<br/>def mean_squared_error(y_true, y_pred):<br/> # Convert inputs to numpy arrays<br/> y_true = np.array(y_true)<br/> y_pred = np.array(y_pred)<br/><br/> # Compute the squared differences<br/> squared_differences = (y_true - y_pred) ** 2<br/><br/> # Compute the mean of the squared differences<br/> mse = np.mean(squared_differences)<br/><br/> return mse<br/>y_true = [3, -0.5, 2, 7]<br/>y_pred = [2.5, 0.0, 2, 8]<br/><br/>mse = mean_squared_error(y_true, y_pred)<br/>print(f"Mean Squared Error: {mse}")<br/>"""<br/><br/><br/>from langchain.text_splitter import PythonCodeTextSplitter<br/>python_splitter = PythonCodeTextSplitter(chunk_size=100, chunk_overlap=0)<br/>documents = python_splitter.create_documents([python_text])<br/><br/>print(documents)</span></pre><p id="7e45" 
class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk">javascript code Splitter :</p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="0a01" class="pf nf gu pc b bg pg ph l pi pj">from langchain.text_splitter import RecursiveCharacterTextSplitter, Language<br/>javascript_text = """<br/>function dotProduct(vectorA, vectorB) {<br/> if (vectorA.length !== vectorB.length) {<br/> throw new Error('Vectors must be of the same length');<br/> }<br/><br/> return vectorA.reduce((sum, currentValue, index) => {<br/> return sum + currentValue * vectorB[index];<br/> }, 0);<br/>}<br/><br/>// Example usage:<br/>const vectorA = [1, 2, 3];<br/>const vectorB = [4, 5, 6];<br/><br/>const result = dotProduct(vectorA, vectorB);<br/>console.log(`Dot Product: ${result}`);<br/>"""<br/>js_splitter = RecursiveCharacterTextSplitter.from_language(<br/> language=Language.JS, chunk_size=65, chunk_overlap=0<br/>)<br/>documents = js_splitter.create_documents([javascript_text])<br/><br/>print(documents)</span></pre><p id="42ae" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Using Llama-index</strong></p><p id="700f" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk">MarkdownNodeParser :</p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="2172" class="pf nf gu pc b bg pg ph l pi pj">from llama_index.core.node_parser import MarkdownNodeParser<br/>from llama_index.core import SimpleDirectoryReader<br/><br/>reader = SimpleDirectoryReader(input_dir="data" ,<br/> required_exts=[".md"])<br/><br/>markdown_docs = reader.load_data()<br/><br/>parser = MarkdownNodeParser()<br/><br/>nodes = parser.get_nodes_from_documents(markdown_docs)<br/><br/><br/>print(nodes)</span></pre><p id="902d" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb 
nc nd gn bk">JSONNodeParser :</p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="7b6d" class="pf nf gu pc b bg pg ph l pi pj">from llama_index.core.node_parser import JSONNodeParser<br/>from llama_index.core import SimpleDirectoryReader<br/><br/>parser = JSONNodeParser()<br/><br/>reader = SimpleDirectoryReader(input_dir="data" ,<br/> required_exts=[".json"])<br/><br/>json_docs = reader.load_data()<br/>nodes = parser.get_nodes_from_documents(json_docs)<br/>print(nodes)</span></pre><p id="c950" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk">HTMLNodeParser :</p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="760c" class="pf nf gu pc b bg pg ph l pi pj">from llama_index.core.node_parser import HTMLNodeParser<br/>from llama_index.core import SimpleDirectoryReader<br/><br/>reader = SimpleDirectoryReader(input_dir="data" ,<br/> required_exts=[".html"])<br/><br/>html_docs = reader.load_data()<br/><br/>tags = ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "b", "i", "u", "section"]<br/><br/>parser = HTMLNodeParser(tags=tags)<br/>nodes = parser.get_nodes_from_documents(html_docs)<br/>print(nodes)</span></pre></div></div></div><div class="ab cb pk pl pm pn" role="separator"><span class="po by bm pp pq pr"></span><span class="po by bm pp pq pr"></span><span class="po by bm pp pq"></span></div><div class="gn go gp gq gr"><div class="ab cb"><div class="ci bh fz ga gb gc"><h2 id="e675" class="ne nf gu bf ng nh ni dy nj nk nl ea nm mr nn no np mv nq nr ns mz nt nu nv nw bk">5 / Semantic Chunking</h2><p id="5c7b" class="pw-post-body-paragraph mg mh gu mi b mj nx ml mm mn ny mp mq mr nz mt mu mv oa mx my mz ob nb nc nd gn bk">Semantic chunking aims to group text into chunks based on semantic meaning rather than fixed size or structure. 
This method uses embeddings to assess the similarity between chunks, ensuring that semantically similar content remains together.</p><p id="d715" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Using Llama-index</strong></p><p id="5625" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk">In LlamaIndex, the <code class="cx ps pt pu pc b"><a class="af pv" href="https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/semantic_splitter" rel="noopener ugc nofollow" target="_blank">SemanticSplitterNodeParser</a></code> class implements this by adaptively selecting breakpoints based on embedding similarity, with configurable parameters such as <code class="cx ps pt pu pc b">buffer_size</code> (initial window size for chunks), <code class="cx ps pt pu pc b">breakpoint_percentile_threshold</code> (split threshold), and <code class="cx ps pt pu pc b">embed_model</code> (embedding model used).</p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="67c7" class="pf nf gu pc b bg pg ph l pi pj"># pip install llama-index-embeddings-huggingface llama-index-embeddings-instructor<br/><br/>from llama_index.embeddings.huggingface import HuggingFaceEmbedding<br/>embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")<br/><br/><br/>from llama_index.core import SimpleDirectoryReader<br/>from llama_index.core.node_parser import (<br/> SentenceSplitter,<br/> SemanticSplitterNodeParser,<br/>)<br/>splitter = SemanticSplitterNodeParser(<br/> buffer_size=1,<br/> breakpoint_percentile_threshold=95,<br/> embed_model=embed_model<br/>)<br/><br/>base_splitter = SentenceSplitter(chunk_size=512)<br/><br/>documents = SimpleDirectoryReader(input_files=["text.txt"]).load_data()<br/>nodes = splitter.get_nodes_from_documents(documents)<br/><br/>print(nodes)</span></pre><p id="745c" class="pw-post-body-paragraph mg mh gu mi b mj 
mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">Using Langchain</strong></p><p id="1f30" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk">Similarly, Langchain’s <code class="cx ps pt pu pc b"><a class="af pv" href="https://python.langchain.com/v0.2/docs/how_to/semantic-chunker" rel="noopener ugc nofollow" target="_blank">SemanticChunker</a></code> splits the text into sentences and locates chunk breakpoints by comparing the embeddings of adjacent sentences; a new chunk begins wherever the difference exceeds a specified threshold, maintaining semantic coherence within chunks.</p><pre class="ow ox oy oz pa pb pc pd bp pe bb bk"><span id="d489" class="pf nf gu pc b bg pg ph l pi pj"># pip install langchain_experimental fastembed langchain_community<br/><br/>from langchain_community.embeddings.fastembed import FastEmbedEmbeddings<br/>embed_model = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")<br/><br/>from langchain_experimental.text_splitter import SemanticChunker<br/>from langchain.text_splitter import RecursiveCharacterTextSplitter<br/><br/>text_splitter = RecursiveCharacterTextSplitter(<br/> chunk_size=1000,<br/> chunk_overlap=0,<br/> length_function=len,<br/> is_separator_regex=False<br/>)<br/>documents = text_splitter.create_documents([text])<br/><br/><br/>semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")<br/>semantic_chunks = semantic_chunker.create_documents([d.page_content for d in documents])<br/><br/>print(semantic_chunks)</span></pre><h2 id="6306" class="ne nf gu bf ng nh ni dy nj nk nl ea nm mr nn no np mv nq nr ns mz nt nu nv nw bk">Choosing the Best Chunking Method</h2><p id="8535" class="pw-post-body-paragraph mg mh gu mi b mj nx ml mm mn ny mp mq mr nz mt mu mv oa mx my mz ob nb nc nd gn bk">Selecting the right chunking strategy depends on your application’s requirements and constraints. 
For simple, structured content, character splitting or recursive chunking may suffice. For more complex documents, document-specific or semantic chunking might be necessary to preserve context and meaning. Consider model compatibility, task specificity, and system constraints to ensure the optimal chunking method for your needs.</p></div></div></div><div class="ab cb pk pl pm pn" role="separator"><span class="po by bm pp pq pr"></span><span class="po by bm pp pq pr"></span><span class="po by bm pp pq"></span></div><div class="gn go gp gq gr"><div class="ab cb"><div class="ci bh fz ga gb gc"><p id="5208" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">My LinkedIn</strong> : <a class="af pv" href="https://www.linkedin.com/in/ayoub-kirouane3" rel="noopener ugc nofollow" target="_blank">https://www.linkedin.com/in/ayoub-kirouane3</a></p><p id="54f9" class="pw-post-body-paragraph mg mh gu mi b mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd gn bk"><strong class="mi gv">My HuggingFace</strong> : <a class="af pv" href="https://huggingface.co/ayoubkirouane" rel="noopener ugc nofollow" target="_blank">https://huggingface.co/ayoubkirouane</a></p></div></div></div></div></section></div></div></article></div><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="pw px ab iv"><div class="py ab"><a class="pz ay am ao" rel="noopener follow" href="/tag/retrieval-augmented-gen?source=post_page-----d56903b167c5--------------------------------"><div class="qa fj cx qb ge qc qd bf b bg z bk qe">Retrieval Augmented Gen</div></a></div><div class="py ab"><a class="pz ay am ao" rel="noopener follow" href="/tag/rag-system?source=post_page-----d56903b167c5--------------------------------"><div class="qa fj cx qb ge qc qd bf b bg z bk qe">Rag System</div></a></div><div class="py ab"><a class="pz ay am ao" rel="noopener follow" 
href="/tag/text-chunking?source=post_page-----d56903b167c5--------------------------------"><div class="qa fj cx qb ge qc qd bf b bg z bk qe">Text Chunking</div></a></div><div class="py ab"><a class="pz ay am ao" rel="noopener follow" href="/tag/large-language-models?source=post_page-----d56903b167c5--------------------------------"><div class="qa fj cx qb ge qc qd bf b bg z bk qe">Large Language Models</div></a></div><div class="py ab"><a class="pz ay am ao" rel="noopener follow" href="/tag/genai?source=post_page-----d56903b167c5--------------------------------"><div class="qa fj cx qb ge qc qd bf b bg z bk qe">Genai</div></a></div></div></div></div><div class="l"></div><footer class="qf pl qg qh qi ab q qj qk c"><div class="l ae"><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="ab cp ql"><div class="ab q ke"><div class="qm l"><span class="l qn qo qp e d"><div class="ab q ke kf"><div class="pw-multi-vote-icon fj kg kh ki kj"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="footerClapButton" rel="noopener follow" href="/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fvote%2Fp%2Fd56903b167c5&operation=register&redirect=https%3A%2F%2Fmedium.com%2F%40ayoubkirouane3%2Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5&user=kirouane+Ayoub&userId=4751fd7878c5&source=---footer_actions--d56903b167c5---------------------clap_footer-----------"><div><div class="bm" aria-hidden="false"><div class="kk ao kl km kn ko am kp kq kr kj"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" aria-label="clap"><path fill-rule="evenodd" d="M11.37.828 12 3.282l.63-2.454zM13.916 3.953l1.523-2.112-1.184-.39zM8.589 1.84l1.522 2.112-.337-2.501zM18.523 18.92c-.86.86-1.75 1.246-2.62 1.33a6 6 0 0 0 .407-.372c2.388-2.389 2.86-4.951 1.399-7.623l-.912-1.603-.79-1.672c-.26-.56-.194-.98.203-1.288a.7.7 0 0 1 .546-.132c.283.046.546.231.728.5l2.363 4.157c.976 1.624 1.141 4.237-1.324 6.702m-10.999-.438L3.37 
14.328a.828.828 0 0 1 .585-1.408.83.83 0 0 1 .585.242l2.158 2.157a.365.365 0 0 0 .516-.516l-2.157-2.158-1.449-1.449a.826.826 0 0 1 1.167-1.17l3.438 3.44a.363.363 0 0 0 .516 0 .364.364 0 0 0 0-.516L5.293 9.513l-.97-.97a.826.826 0 0 1 0-1.166.84.84 0 0 1 1.167 0l.97.968 3.437 3.436a.36.36 0 0 0 .517 0 .366.366 0 0 0 0-.516L6.977 7.83a.82.82 0 0 1-.241-.584.82.82 0 0 1 .824-.826c.219 0 .43.087.584.242l5.787 5.787a.366.366 0 0 0 .587-.415l-1.117-2.363c-.26-.56-.194-.98.204-1.289a.7.7 0 0 1 .546-.132c.283.046.545.232.727.501l2.193 3.86c1.302 2.38.883 4.59-1.277 6.75-1.156 1.156-2.602 1.627-4.19 1.367-1.418-.236-2.866-1.033-4.079-2.246M10.75 5.971l2.12 2.12c-.41.502-.465 1.17-.128 1.89l.22.465-3.523-3.523a.8.8 0 0 1-.097-.368c0-.22.086-.428.241-.584a.847.847 0 0 1 1.167 0m7.355 1.705c-.31-.461-.746-.758-1.23-.837a1.44 1.44 0 0 0-1.11.275c-.312.24-.505.543-.59.881a1.74 1.74 0 0 0-.906-.465 1.47 1.47 0 0 0-.82.106l-2.182-2.182a1.56 1.56 0 0 0-2.2 0 1.54 1.54 0 0 0-.396.701 1.56 1.56 0 0 0-2.21-.01 1.55 1.55 0 0 0-.416.753c-.624-.624-1.649-.624-2.237-.037a1.557 1.557 0 0 0 0 2.2c-.239.1-.501.238-.715.453a1.56 1.56 0 0 0 0 2.2l.516.515a1.556 1.556 0 0 0-.753 2.615L7.01 19c1.32 1.319 2.909 2.189 4.475 2.449q.482.08.971.08c.85 0 1.653-.198 2.393-.579.231.033.46.054.686.054 1.266 0 2.457-.52 3.505-1.567 2.763-2.763 2.552-5.734 1.439-7.586z" clip-rule="evenodd"></path></svg></div></div></div></a></span></div><div class="pw-multi-vote-count l ks kt ku kv kw kx ky"><p class="bf b dv z du"><span class="kz">--</span></p></div></div></span><span class="l h g f qq qr"><div class="ab q ke kf"><div class="pw-multi-vote-icon fj kg kh ki kj"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="footerClapButton" rel="noopener follow" 
href="/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fvote%2Fp%2Fd56903b167c5&operation=register&redirect=https%3A%2F%2Fmedium.com%2F%40ayoubkirouane3%2Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5&user=kirouane+Ayoub&userId=4751fd7878c5&source=---footer_actions--d56903b167c5---------------------clap_footer-----------"><div><div class="bm" aria-hidden="false"><div class="kk ao kl km kn ko am kp kq kr kj"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" aria-label="clap"><path fill-rule="evenodd" d="M11.37.828 12 3.282l.63-2.454zM13.916 3.953l1.523-2.112-1.184-.39zM8.589 1.84l1.522 2.112-.337-2.501zM18.523 18.92c-.86.86-1.75 1.246-2.62 1.33a6 6 0 0 0 .407-.372c2.388-2.389 2.86-4.951 1.399-7.623l-.912-1.603-.79-1.672c-.26-.56-.194-.98.203-1.288a.7.7 0 0 1 .546-.132c.283.046.546.231.728.5l2.363 4.157c.976 1.624 1.141 4.237-1.324 6.702m-10.999-.438L3.37 14.328a.828.828 0 0 1 .585-1.408.83.83 0 0 1 .585.242l2.158 2.157a.365.365 0 0 0 .516-.516l-2.157-2.158-1.449-1.449a.826.826 0 0 1 1.167-1.17l3.438 3.44a.363.363 0 0 0 .516 0 .364.364 0 0 0 0-.516L5.293 9.513l-.97-.97a.826.826 0 0 1 0-1.166.84.84 0 0 1 1.167 0l.97.968 3.437 3.436a.36.36 0 0 0 .517 0 .366.366 0 0 0 0-.516L6.977 7.83a.82.82 0 0 1-.241-.584.82.82 0 0 1 .824-.826c.219 0 .43.087.584.242l5.787 5.787a.366.366 0 0 0 .587-.415l-1.117-2.363c-.26-.56-.194-.98.204-1.289a.7.7 0 0 1 .546-.132c.283.046.545.232.727.501l2.193 3.86c1.302 2.38.883 4.59-1.277 6.75-1.156 1.156-2.602 1.627-4.19 1.367-1.418-.236-2.866-1.033-4.079-2.246M10.75 5.971l2.12 2.12c-.41.502-.465 1.17-.128 1.89l.22.465-3.523-3.523a.8.8 0 0 1-.097-.368c0-.22.086-.428.241-.584a.847.847 0 0 1 1.167 0m7.355 1.705c-.31-.461-.746-.758-1.23-.837a1.44 1.44 0 0 0-1.11.275c-.312.24-.505.543-.59.881a1.74 1.74 0 0 0-.906-.465 1.47 1.47 0 0 0-.82.106l-2.182-2.182a1.56 1.56 0 0 0-2.2 0 1.54 1.54 0 0 0-.396.701 1.56 1.56 0 0 0-2.21-.01 1.55 1.55 0 0 0-.416.753c-.624-.624-1.649-.624-2.237-.037a1.557 1.557 0 
0 0 0 2.2c-.239.1-.501.238-.715.453a1.56 1.56 0 0 0 0 2.2l.516.515a1.556 1.556 0 0 0-.753 2.615L7.01 19c1.32 1.319 2.909 2.189 4.475 2.449q.482.08.971.08c.85 0 1.653-.198 2.393-.579.231.033.46.054.686.054 1.266 0 2.457-.52 3.505-1.567 2.763-2.763 2.552-5.734 1.439-7.586z" clip-rule="evenodd"></path></svg></div></div></div></a></span></div><div class="pw-multi-vote-count l ks kt ku kv kw kx ky"><p class="bf b dv z du"><span class="kz">--</span></p></div></div></span></div><div class="bq ab"><div><div class="bm" aria-hidden="false"><button class="ao kk la lb ab q fk lc ld" aria-label="responses"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" class="le"><path d="M18.006 16.803c1.533-1.456 2.234-3.325 2.234-5.321C20.24 7.357 16.709 4 12.191 4S4 7.357 4 11.482c0 4.126 3.674 7.482 8.191 7.482.817 0 1.622-.111 2.393-.327.231.2.48.391.744.559 1.06.693 2.203 1.044 3.399 1.044.224-.008.4-.112.486-.287a.49.49 0 0 0-.042-.518c-.495-.67-.845-1.364-1.04-2.057a4 4 0 0 1-.125-.598zm-3.122 1.055-.067-.223-.315.096a8 8 0 0 1-2.311.338c-4.023 0-7.292-2.955-7.292-6.587 0-3.633 3.269-6.588 7.292-6.588 4.014 0 7.112 2.958 7.112 6.593 0 1.794-.608 3.469-2.027 4.72l-.195.168v.255c0 .056 0 .151.016.295.025.231.081.478.154.733.154.558.398 1.117.722 1.659a5.3 5.3 0 0 1-2.165-.845c-.276-.176-.714-.383-.941-.59z"></path></svg></button></div></div></div></div><div class="ab q"><div class="pr l is"><div><div class="bm" aria-hidden="false"><span><a class="af ag ah ai aj ak al am an ao ap aq ar as at" data-testid="footerBookmarkButton" rel="noopener follow" href="/m/signin?actionUrl=https%3A%2F%2Fmedium.com%2F_%2Fbookmark%2Fp%2Fd56903b167c5&operation=register&redirect=https%3A%2F%2Fmedium.com%2F%40ayoubkirouane3%2Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5&source=---footer_actions--d56903b167c5---------------------bookmark_footer-----------"><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25" 
class="du lg" aria-label="Add to list bookmark button"><path fill="currentColor" d="M18 2.5a.5.5 0 0 1 1 0V5h2.5a.5.5 0 0 1 0 1H19v2.5a.5.5 0 1 1-1 0V6h-2.5a.5.5 0 0 1 0-1H18zM7 7a1 1 0 0 1 1-1h3.5a.5.5 0 0 0 0-1H8a2 2 0 0 0-2 2v14a.5.5 0 0 0 .805.396L12.5 17l5.695 4.396A.5.5 0 0 0 19 21v-8.5a.5.5 0 0 0-1 0v7.485l-5.195-4.012a.5.5 0 0 0-.61 0L7 19.985z"></path></svg></a></span></div></div></div><div class="pr l is"><div class="bm" aria-hidden="false" aria-describedby="postFooterSocialMenu" aria-labelledby="postFooterSocialMenu"><div><div class="bm" aria-hidden="false"><button aria-controls="postFooterSocialMenu" aria-expanded="false" aria-label="Share Post" data-testid="footerSocialShareButton" class="af fk ah ai aj ak al lo an ao ap ex lp lq ld lr"><svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" fill="none" viewBox="0 0 24 24"><path fill="currentColor" fill-rule="evenodd" d="M15.218 4.931a.4.4 0 0 1-.118.132l.012.006a.45.45 0 0 1-.292.074.5.5 0 0 1-.3-.13l-2.02-2.02v7.07c0 .28-.23.5-.5.5s-.5-.22-.5-.5v-7.04l-2 2a.45.45 0 0 1-.57.04h-.02a.4.4 0 0 1-.16-.3.4.4 0 0 1 .1-.32l2.8-2.8a.5.5 0 0 1 .7 0l2.8 2.79a.42.42 0 0 1 .068.498m-.106.138.008.004v-.01zM16 7.063h1.5a2 2 0 0 1 2 2v10a2 2 0 0 1-2 2h-11c-1.1 0-2-.9-2-2v-10a2 2 0 0 1 2-2H8a.5.5 0 0 1 .35.15.5.5 0 0 1 .15.35.5.5 0 0 1-.15.35.5.5 0 0 1-.35.15H6.4c-.5 0-.9.4-.9.9v10.2a.9.9 0 0 0 .9.9h11.2c.5 0 .9-.4.9-.9v-10.2c0-.5-.4-.9-.9-.9H16a.5.5 0 0 1 0-1" clip-rule="evenodd"></path></svg></button></div></div></div></div></div></div></div></div></div></footer><div class="qs qt qu qv qw l"><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="qx bh r qy"></div><div class="ab qz ra rb iu it"><div class="rc rd re rf rg rh ri rj rk rl ab cp"><div class="h k"><a tabindex="0" rel="noopener follow" href="/@ayoubkirouane3?source=post_page---post_author_info--d56903b167c5--------------------------------"><div class="l fj"><img alt="kirouane Ayoub" class="l fd by ic ib cx" 
src="https://miro.medium.com/v2/resize:fill:96:96/1*T-KWhmfASlLM3XMvRKZnWA.jpeg" width="48" height="48" loading="lazy"/><div class="fr by l ic ib fs n ay rm"></div></div></a></div><div class="j i d"><a tabindex="0" rel="noopener follow" href="/@ayoubkirouane3?source=post_page---post_author_info--d56903b167c5--------------------------------"><div class="l fj"><img alt="kirouane Ayoub" class="l fd by rn ro cx" src="https://miro.medium.com/v2/resize:fill:128:128/1*T-KWhmfASlLM3XMvRKZnWA.jpeg" width="64" height="64" loading="lazy"/><div class="fr by l rn ro fs n ay rm"></div></div></a></div><div class="j i d rp is"><div class="ab"><span><button class="bf b bg z rq qa rr rs rt ru rv ev ew rw rx ry fa fb fc fd bm fe ff">Follow</button></span></div></div></div><div class="ab co rz"><div class="sa sb sc sd se l"><a class="af ag ah aj ak al am an ao ap aq ar as at ab q" rel="noopener follow" href="/@ayoubkirouane3?source=post_page---post_author_info--d56903b167c5--------------------------------"><h2 class="pw-author-name bf sg sh si sj sk sl sm mr no np mv nr ns mz nu nv bk"><span class="gn sf">Written by <!-- -->kirouane Ayoub</span></h2></a><div class="py ab ia"><div class="l is"><span class="pw-follower-count bf b bg z du"><a class="af ag ah ai aj ak al am an ao ap aq ar il" rel="noopener follow" href="/@ayoubkirouane3/followers?source=post_page---post_author_info--d56903b167c5--------------------------------">309 Followers</a></span></div><div class="bf b bg z du ab sn"><span class="im l" aria-hidden="true"><span class="bf b bg z du">·</span></span><a class="af ag ah ai aj ak al am an ao ap aq ar il" rel="noopener follow" href="/@ayoubkirouane3/following?source=post_page---post_author_info--d56903b167c5--------------------------------">2 Following</a></div></div><div class="so l"><p class="bf b bg z bk"><span class="gn">I Like building Machine Learning models from scratch .</span></p></div></div></div><div class="h k"><div class="ab"><span><button class="bf b bg z rq qa 
rr rs rt ru rv ev ew rw rx ry fa fb fc fd bm fe ff">Follow</button></span></div></div></div></div></div></div><div class="sp l"><div class="qx bh r sq sr ss st su"></div><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="ab q cp"><h2 class="bf sg od of nj og oi nm oj ol om on op oq or ot ou bk">No responses yet</h2><div class="ab sv"><div><div class="bm" aria-hidden="false"><!-- Icon-only link to the Medium Rules policy page: aria-label supplies the accessible name and announces the new tab, so the decorative SVG is hidden from assistive technology. --><a class="sw sx" href="https://policy.medium.com/medium-rules-30e5502c4eb4?source=post_page---post_responses--d56903b167c5--------------------------------" rel="noopener follow" target="_blank" aria-label="Medium Rules (opens in a new tab)"><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" viewBox="0 0 25 25" aria-hidden="true"><path fill-rule="evenodd" d="M11.987 5.036a.754.754 0 0 1 .914-.01c.972.721 1.767 1.218 2.6 1.543.828.322 1.719.485 2.887.505a.755.755 0 0 1 .741.757c-.018 3.623-.43 6.256-1.449 8.21-1.034 1.984-2.662 3.209-4.966 4.083a.75.75 0 0 1-.537-.003c-2.243-.874-3.858-2.095-4.897-4.074-1.024-1.951-1.457-4.583-1.476-8.216a.755.755 0 0 1 .741-.757c1.195-.02 2.1-.182 2.923-.503.827-.322 1.6-.815 2.519-1.535m.468.903c-.897.69-1.717 1.21-2.623 1.564-.898.35-1.856.527-3.026.565.037 3.45.469 5.817 1.36 7.515.884 1.684 2.25 2.762 4.284 3.571 2.092-.81 3.465-1.89 4.344-3.575.886-1.698 1.299-4.065 1.334-7.512-1.149-.039-2.091-.217-2.99-.567-.906-.353-1.745-.873-2.683-1.561m-.009 9.155a2.672 2.672 0 1 0 0-5.344 2.672 2.672 0 0 0 0 5.344m0 1a3.672 3.672 0 1 0 0-7.344 3.672 3.672 0 0 0 0 7.344m-1.813-3.777.525-.526.916.917 1.623-1.625.526.526-2.149 2.152z" clip-rule="evenodd"></path></svg></a></div></div></div></div></div></div></div><div class="sy sz ta tb tc l bx"><div class="h k j"><div class="qx bh td te"></div><div class="ab cb"><div class="ci bh fz ga gb gc"><div class="tf ab ke iv"><div class="tg th l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://help.medium.com/hc/en-us?source=post_page-----d56903b167c5--------------------------------" rel="noopener follow"><p class="bf b dv z 
du">Help</p></a></div><div class="tg th l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://medium.statuspage.io/?source=post_page-----d56903b167c5--------------------------------" rel="noopener follow"><p class="bf b dv z du">Status</p></a></div><div class="tg th l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" rel="noopener follow" href="/about?autoplay=1&source=post_page-----d56903b167c5--------------------------------"><p class="bf b dv z du">About</p></a></div><div class="tg th l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" rel="noopener follow" href="/jobs-at-medium/work-at-medium-959d1a85284e?source=post_page-----d56903b167c5--------------------------------"><p class="bf b dv z du">Careers</p></a></div><!-- Press is an e-mail contact, so the href needs the mailto: scheme; without it the address resolves as a relative page URL. The page-tracking query string is omitted because a mailto: URI would treat it as a mail header field. --><div class="tg th l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="mailto:pressinquiries@medium.com" rel="noopener follow"><p class="bf b dv z du">Press</p></a></div><div class="tg th l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://blog.medium.com/?source=post_page-----d56903b167c5--------------------------------" rel="noopener follow"><p class="bf b dv z du">Blog</p></a></div><div class="tg th l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://policy.medium.com/medium-privacy-policy-f03bf92035c9?source=post_page-----d56903b167c5--------------------------------" rel="noopener follow"><p class="bf b dv z du">Privacy</p></a></div><div class="tg th l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://policy.medium.com/medium-terms-of-service-9db0094a1e0f?source=post_page-----d56903b167c5--------------------------------" rel="noopener follow"><p class="bf b dv z du">Terms</p></a></div><div class="tg th l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" href="https://speechify.com/medium?source=post_page-----d56903b167c5--------------------------------" rel="noopener 
follow"><p class="bf b dv z du">Text to speech</p></a></div><div class="tg l"><a class="af ag ah ai aj ak al am an ao ap aq ar as at" rel="noopener follow" href="/business?source=post_page-----d56903b167c5--------------------------------"><p class="bf b dv z du">Teams</p></a></div></div></div></div></div></div></div></div></div></div><script>window.__BUILD_ID__="main-20241126-181518-0cb59a020f"</script><script>window.__GRAPHQL_URI__ = "https://medium.com/_/graphql"</script><script>window.__PRELOADED_STATE__ = {"algolia":{"queries":{}},"cache":{"experimentGroupSet":true,"reason":"","group":"enabled","tags":["group-edgeCachePosts","post-d56903b167c5","user-4751fd7878c5"],"serverVariantState":"44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a","middlewareEnabled":true,"cacheStatus":"DYNAMIC","shouldUseCache":true,"vary":[],"lohpSummerUpsellEnabled":false,"publicationHierarchyEnabledWeb":false,"postBottomResponsesEnabled":false},"client":{"hydrated":false,"isUs":false,"isNativeMedium":false,"isSafariMobile":false,"isSafari":false,"isFirefox":false,"routingEntity":{"type":"DEFAULT","explicit":false},"viewerIsBot":false},"debug":{"requestId":"5e46cd15-ed09-455d-aa6f-b2c1e37b18b0","hybridDevServices":[],"originalSpanCarrier":{"traceparent":"00-c4d2c2004374a83bcafed66742cbc11f-b2e531b261a66e33-01"}},"multiVote":{"clapsPerPost":{}},"navigation":{"branch":{"show":null,"hasRendered":null,"blockedByCTA":false},"hideGoogleOneTap":false,"hasRenderedAlternateUserBanner":null,"currentLocation":"https:\u002F\u002Fmedium.com\u002F@ayoubkirouane3\u002Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5","host":"medium.com","hostname":"medium.com","referrer":"","hasSetReferrer":false,"susiModal":{"step":null,"operation":"register"},"postRead":false,"partnerProgram":{"selectedCountryCode":null},"queryString":"","currentHash":""},"config":{"nodeEnv":"production","version":"main-20241126-181518-0cb59a020f","target":"production","productName":"Medium","public
Url":"https:\u002F\u002Fcdn-client.medium.com\u002Flite","authDomain":"medium.com","authGoogleClientId":"216296035834-k1k6qe060s2tp2a2jam4ljdcms00sttg.apps.googleusercontent.com","favicon":"production","glyphUrl":"https:\u002F\u002Fglyph.medium.com","branchKey":"key_live_ofxXr2qTrrU9NqURK8ZwEhknBxiI6KBm","algolia":{"appId":"MQ57UUUQZ2","apiKeySearch":"394474ced050e3911ae2249ecc774921","indexPrefix":"medium_","host":"-dsn.algolia.net"},"recaptchaKey":"6Lfc37IUAAAAAKGGtC6rLS13R1Hrw_BqADfS1LRk","recaptcha3Key":"6Lf8R9wUAAAAABMI_85Wb8melS7Zj6ziuf99Yot5","recaptchaEnterpriseKeyId":"6Le-uGgpAAAAAPprRaokM8AKthQ9KNGdoxaGUvVp","datadog":{"applicationId":"6702d87d-a7e0-42fe-bbcb-95b469547ea0","clientToken":"pub853ea8d17ad6821d9f8f11861d23dfed","rumToken":"pubf9cc52896502b9413b68ba36fc0c7162","context":{"deployment":{"target":"production","tag":"main-20241126-181518-0cb59a020f","commit":"0cb59a020f4453d0900f671f1a6576feecc55e74"}},"datacenter":"us"},"googleAnalyticsCode":"G-7JY7T788PK","googlePay":{"apiVersion":"2","apiVersionMinor":"0","merchantId":"BCR2DN6TV7EMTGBM","merchantName":"Medium","instanceMerchantId":"13685562959212738550"},"applePay":{"version":3},"signInWallCustomDomainCollectionIds":["3a8144eabfe3","336d898217ee","61061eb0c96b","138adf9c44c","819cc2aaeee0"],"mediumMastodonDomainName":"me.dm","mediumOwnedAndOperatedCollectionIds":["8a9336e5bb4","b7e45b22fec3","193b68bd4fba","8d6b8a439e32","54c98c43354d","3f6ecf56618","d944778ce714","92d2092dc598","ae2a65f35510","1285ba81cada","544c7006046e","fc8964313712","40187e704f1c","88d9857e584e","7b6769f2748b","bcc38c8f6edf","cef6983b292","cb8577c9149e","444d13b52878","713d7dbc99b0","ef8e90590e66","191186aaafa0","55760f21cdc5","9dc80918cc93","bdc4052bbdba","8ccfed20cbb2"],"tierOneDomains":["medium.com","thebolditalic.com","arcdigital.media","towardsdatascience.com","uxdesign.cc","codeburst.io","psiloveyou.xyz","writingcooperative.com","entrepreneurshandbook.co","prototypr.io","betterhumans.coach.me","theascent.pub"],"topics
ToFollow":["d61cf867d93f","8a146bc21b28","1eca0103fff3","4d562ee63426","aef1078a3ef5","e15e46793f8d","6158eb913466","55f1c20aba7a","3d18b94f6858","4861fee224fd","63c6f1f93ee","1d98b3a9a871","decb52b64abf","ae5d4995e225","830cded25262"],"topicToTagMappings":{"accessibility":"accessibility","addiction":"addiction","android-development":"android-development","art":"art","artificial-intelligence":"artificial-intelligence","astrology":"astrology","basic-income":"basic-income","beauty":"beauty","biotech":"biotech","blockchain":"blockchain","books":"books","business":"business","cannabis":"cannabis","cities":"cities","climate-change":"climate-change","comics":"comics","coronavirus":"coronavirus","creativity":"creativity","cryptocurrency":"cryptocurrency","culture":"culture","cybersecurity":"cybersecurity","data-science":"data-science","design":"design","digital-life":"digital-life","disability":"disability","economy":"economy","education":"education","equality":"equality","family":"family","feminism":"feminism","fiction":"fiction","film":"film","fitness":"fitness","food":"food","freelancing":"freelancing","future":"future","gadgets":"gadgets","gaming":"gaming","gun-control":"gun-control","health":"health","history":"history","humor":"humor","immigration":"immigration","ios-development":"ios-development","javascript":"javascript","justice":"justice","language":"language","leadership":"leadership","lgbtqia":"lgbtqia","lifestyle":"lifestyle","machine-learning":"machine-learning","makers":"makers","marketing":"marketing","math":"math","media":"media","mental-health":"mental-health","mindfulness":"mindfulness","money":"money","music":"music","neuroscience":"neuroscience","nonfiction":"nonfiction","outdoors":"outdoors","parenting":"parenting","pets":"pets","philosophy":"philosophy","photography":"photography","podcasts":"podcast","poetry":"poetry","politics":"politics","privacy":"privacy","product-management":"product-management","productivity":"productivity","programming":"prog
ramming","psychedelics":"psychedelics","psychology":"psychology","race":"race","relationships":"relationships","religion":"religion","remote-work":"remote-work","san-francisco":"san-francisco","science":"science","self":"self","self-driving-cars":"self-driving-cars","sexuality":"sexuality","social-media":"social-media","society":"society","software-engineering":"software-engineering","space":"space","spirituality":"spirituality","sports":"sports","startups":"startup","style":"style","technology":"technology","transportation":"transportation","travel":"travel","true-crime":"true-crime","tv":"tv","ux":"ux","venture-capital":"venture-capital","visual-design":"visual-design","work":"work","world":"world","writing":"writing"},"defaultImages":{"avatar":{"imageId":"1*dmbNkD5D-u45r44go_cf0g.png","height":150,"width":150},"orgLogo":{"imageId":"7*V1_7XP4snlmqrc_0Njontw.png","height":110,"width":500},"postLogo":{"imageId":"bd978bb536350a710e8efb012513429cabdc4c28700604261aeda246d0f980b7","height":810,"width":1440},"postPreviewImage":{"imageId":"1*hn4v1tCaJy7cWMyb0bpNpQ.png","height":386,"width":579}},"collectionStructuredData":{"8d6b8a439e32":{"name":"Elemental","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F980\u002F1*9ygdqoKprhwuTVKUM0DLPA@2x.png","width":980,"height":159}}},"3f6ecf56618":{"name":"Forge","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F596\u002F1*uULpIlImcO5TDuBZ6lm7Lg@2x.png","width":596,"height":183}}},"ae2a65f35510":{"name":"GEN","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"Im
ageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F264\u002F1*RdVZMdvfV3YiZTw6mX7yWA.png","width":264,"height":140}}},"88d9857e584e":{"name":"LEVEL","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F540\u002F1*JqYMhNX6KNNb2UlqGqO2WQ.png","width":540,"height":108}}},"7b6769f2748b":{"name":"Marker","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fcdn-images-1.medium.com\u002Fmax\u002F383\u002F1*haCUs0wF6TgOOvfoY-jEoQ@2x.png","width":383,"height":92}}},"444d13b52878":{"name":"OneZero","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F540\u002F1*cw32fIqCbRWzwJaoQw6BUg.png","width":540,"height":123}}},"8ccfed20cbb2":{"name":"Zora","data":{"@type":"NewsMediaOrganization","ethicsPolicy":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Farticles\u002F360043290473","logo":{"@type":"ImageObject","url":"https:\u002F\u002Fmiro.medium.com\u002Fmax\u002F540\u002F1*tZUQqRcCCZDXjjiZ4bDvgQ.png","width":540,"height":106}}}},"embeddedPostIds":{"coronavirus":"cd3010f9d81f"},"sharedCdcMessaging":{"COVID_APPLICABLE_TAG_SLUGS":[],"COVID_APPLICABLE_TOPIC_NAMES":[],"COVID_APPLICABLE_TOPIC_NAMES_FOR_TOPIC_PAGE":[],"COVID_MESSAGES":{"tierA":{"text":"For more information on the novel coronavirus and Covid-19, visit cdc.gov.","markups":[{"start":66,"end":73,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]},"tierB":{"text":"Anyone can publish on Medium per our Policies, but we don’t fact-check every story. 
For more info about the coronavirus, see cdc.gov.","markups":[{"start":37,"end":45,"href":"https:\u002F\u002Fhelp.medium.com\u002Fhc\u002Fen-us\u002Fcategories\u002F201931128-Policies-Safety"},{"start":125,"end":132,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]},"paywall":{"text":"This article has been made free for everyone, thanks to Medium Members. For more information on the novel coronavirus and Covid-19, visit cdc.gov.","markups":[{"start":56,"end":70,"href":"https:\u002F\u002Fmedium.com\u002Fmembership"},{"start":138,"end":145,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]},"unbound":{"text":"This article is free for everyone, thanks to Medium Members. For more information on the novel coronavirus and Covid-19, visit cdc.gov.","markups":[{"start":45,"end":59,"href":"https:\u002F\u002Fmedium.com\u002Fmembership"},{"start":127,"end":134,"href":"https:\u002F\u002Fwww.cdc.gov\u002Fcoronavirus\u002F2019-nCoV"}]}},"COVID_BANNER_POST_ID_OVERRIDE_WHITELIST":["3b31a67bff4a"]},"sharedVoteMessaging":{"TAGS":["politics","election-2020","government","us-politics","election","2020-presidential-race","trump","donald-trump","democrats","republicans","congress","republican-party","democratic-party","biden","joe-biden","maga"],"TOPICS":["politics","election"],"MESSAGE":{"text":"Find out more about the U.S. 
election results here.","markups":[{"start":46,"end":50,"href":"https:\u002F\u002Fcookpolitical.com\u002F2020-national-popular-vote-tracker"}]},"EXCLUDE_POSTS":["397ef29e3ca5"]},"embedPostRules":[],"recircOptions":{"v1":{"limit":3},"v2":{"limit":8}},"braintreeClientKey":"production_zjkj96jm_m56f8fqpf7ngnrd4","braintree":{"enabled":true,"merchantId":"m56f8fqpf7ngnrd4","merchantAccountId":{"usd":"AMediumCorporation_instant","eur":"amediumcorporation_EUR","cad":"amediumcorporation_CAD"},"publicKey":"ds2nn34bg2z7j5gd","braintreeEnvironment":"production","dashboardUrl":"https:\u002F\u002Fwww.braintreegateway.com\u002Fmerchants","gracePeriodDurationInDays":14,"mediumMembershipPlanId":{"monthly":"ce105f8c57a3","monthlyV2":"e8a5e126-792b-4ee6-8fba-d574c1b02fc5","monthlyWithTrial":"d5ee3dbe3db8","monthlyPremium":"fa741a9b47a2","yearly":"a40ad4a43185","yearlyV2":"3815d7d6-b8ca-4224-9b8c-182f9047866e","yearlyStaff":"d74fb811198a","yearlyWithTrial":"b3bc7350e5c7","yearlyPremium":"e21bd2c12166","monthlyOneYearFree":"e6c0637a-2bad-4171-ab4f-3c268633d83c","monthly25PercentOffFirstYear":"235ecc62-0cdb-49ae-9378-726cd21c504b","monthly20PercentOffFirstYear":"ba518864-9c13-4a99-91ca-411bf0cac756","monthly15PercentOffFirstYear":"594c029b-9f89-43d5-88f8-8173af4e070e","monthly10PercentOffFirstYear":"c6c7bc9a-40f2-4b51-8126-e28511d5bdb0","monthlyForStudents":"629ebe51-da7d-41fd-8293-34cd2f2030a8","yearlyOneYearFree":"78ba7be9-0d9f-4ece-aa3e-b54b826f2bf1","yearly25PercentOffFirstYear":"2dbb010d-bb8f-4eeb-ad5c-a08509f42d34","yearly20PercentOffFirstYear":"47565488-435b-47f8-bf93-40d5fbe0ebc8","yearly15PercentOffFirstYear":"8259809b-0881-47d9-acf7-6c001c7f720f","yearly10PercentOffFirstYear":"9dd694fb-96e1-472c-8d9e-3c868d5c1506","yearlyForStudents":"e29345ef-ab1c-4234-95c5-70e50fe6bc23","monthlyCad":"p52orjkaceei","yearlyCad":"h4q9g2up9ktt"},"braintreeDiscountId":{"oneMonthFree":"MONTHS_FREE_01","threeMonthsFree":"MONTHS_FREE_03","sixMonthsFree":"MONTHS_FREE_06","fiftyPercentOffOneYear":"FIFT
Y_PERCENT_OFF_ONE_YEAR"},"3DSecureVersion":"2","defaultCurrency":"usd","providerPlanIdCurrency":{"4ycw":"usd","rz3b":"usd","3kqm":"usd","jzw6":"usd","c2q2":"usd","nnsw":"usd","q8qw":"usd","d9y6":"usd","fx7w":"cad","nwf2":"cad"}},"paypalClientId":"AXj1G4fotC2GE8KzWX9mSxCH1wmPE3nJglf4Z2ig_amnhvlMVX87otaq58niAg9iuLktVNF_1WCMnN7v","paypal":{"host":"https:\u002F\u002Fapi.paypal.com:443","clientMode":"production","serverMode":"live","webhookId":"4G466076A0294510S","monthlyPlan":{"planId":"P-9WR0658853113943TMU5FDQA","name":"Medium Membership (Monthly) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"yearlyPlan":{"planId":"P-7N8963881P8875835MU5JOPQ","name":"Medium Membership (Annual) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed annually."},"oneYearGift":{"name":"Medium Membership (1 Year, Digital Gift Code)","description":"Unlimited access to the best and brightest stories on Medium. Gift codes can be redeemed at medium.com\u002Fredeem.","price":"50.00","currency":"USD","sku":"membership-gift-1-yr"},"oldMonthlyPlan":{"planId":"P-96U02458LM656772MJZUVH2Y","name":"Medium Membership (Monthly)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"oldYearlyPlan":{"planId":"P-59P80963JF186412JJZU3SMI","name":"Medium Membership (Annual)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed annually."},"monthlyPlanWithTrial":{"planId":"P-66C21969LR178604GJPVKUKY","name":"Medium Membership (Monthly) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"yearlyPlanWithTrial":{"planId":"P-6XW32684EX226940VKCT2MFA","name":"Medium Membership (Annual) with setup fee","description":"Unlimited access to the best and brightest stories on Medium. 
Membership billed annually."},"oldMonthlyPlanNoSetupFee":{"planId":"P-4N046520HR188054PCJC7LJI","name":"Medium Membership (Monthly)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed monthly."},"oldYearlyPlanNoSetupFee":{"planId":"P-7A4913502Y5181304CJEJMXQ","name":"Medium Membership (Annual)","description":"Unlimited access to the best and brightest stories on Medium. Membership billed annually."},"sdkUrl":"https:\u002F\u002Fwww.paypal.com\u002Fsdk\u002Fjs"},"stripePublishableKey":"pk_live_7FReX44VnNIInZwrIIx6ghjl","log":{"json":true,"level":"info"},"imageUploadMaxSizeMb":25,"staffPicks":{"title":"Staff Picks","catalogId":"c7bc6e1ee00f"}},"session":{"xsrf":""}}</script><script>window.__APOLLO_STATE__ = {"ROOT_QUERY":{"__typename":"Query","viewer":null,"collectionByDomainOrSlug({\"domainOrSlug\":\"medium.com\"})":null,"postResult({\"id\":\"d56903b167c5\"})":{"__ref":"Post:d56903b167c5"}},"LinkedAccounts:4751fd7878c5":{"__typename":"LinkedAccounts","mastodon":null,"id":"4751fd7878c5"},"UserViewerEdge:userId:4751fd7878c5-viewerId:lo_ea78ee978563":{"__typename":"UserViewerEdge","id":"userId:4751fd7878c5-viewerId:lo_ea78ee978563","isFollowing":false,"isUser":false,"isMuting":false},"NewsletterV3:3c36e1d68dc9":{"__typename":"NewsletterV3","id":"3c36e1d68dc9","type":"NEWSLETTER_TYPE_AUTHOR","slug":"4751fd7878c5","name":"4751fd7878c5","collection":null,"user":{"__ref":"User:4751fd7878c5"}},"User:4751fd7878c5":{"__typename":"User","id":"4751fd7878c5","name":"kirouane Ayoub","username":"ayoubkirouane3","newsletterV3":{"__ref":"NewsletterV3:3c36e1d68dc9"},"linkedAccounts":{"__ref":"LinkedAccounts:4751fd7878c5"},"isSuspended":false,"imageId":"1*T-KWhmfASlLM3XMvRKZnWA.jpeg","mediumMemberAt":0,"verifications":{"__typename":"VerifiedInfo","isBookAuthor":false},"socialStats":{"__typename":"SocialStats","followerCount":309,"followingCount":0,"collectionFollowingCount":2},"customDomainState":null,"hasSubdomain":false,"bio":"I Like 
building Machine Learning models from scratch .","isPartnerProgramEnrolled":false,"viewerEdge":{"__ref":"UserViewerEdge:userId:4751fd7878c5-viewerId:lo_ea78ee978563"},"viewerIsUser":false,"postSubscribeMembershipUpsellShownAt":0,"membership":null,"allowNotes":true,"twitterScreenName":""},"Paragraph:afec9737778e_0":{"__typename":"Paragraph","id":"afec9737778e_0","name":"fe01","type":"H3","href":null,"layout":null,"metadata":null,"text":"Simple Chunking Strategies for RAG Applications (Part 1)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_1":{"__typename":"Paragraph","id":"afec9737778e_1","name":"82c9","type":"P","href":null,"layout":null,"metadata":null,"text":"When building a RAG (Retrieval-Augmented Generation) system, chunking text into manageable segments is a crucial step. Chunking not only ensures that content is well-organized but also improves the relevance and efficiency of search results. While many chunking techniques exist, this post will focus on basic strategies implemented using Langchain and Llama-Index. 
This is the first part of a series where we will explore these strategies.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":61,"end":69,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_2":{"__typename":"Paragraph","id":"afec9737778e_2","name":"4a41","type":"H4","href":null,"layout":null,"metadata":null,"text":"Chunking Considerations","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_3":{"__typename":"Paragraph","id":"afec9737778e_3","name":"14a4","type":"P","href":null,"layout":null,"metadata":null,"text":"Before diving into the methods, it’s essential to consider:","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_4":{"__typename":"Paragraph","id":"afec9737778e_4","name":"e1a5","type":"P","href":null,"layout":null,"metadata":null,"text":"Chunk Size The size of each chunk should strike a balance between maintaining enough context for meaningful analysis and avoiding excessively large chunks that could affect focus. Smaller chunks (e.g., 256 to 512 tokens) are suited for detailed, granular tasks, whereas larger chunks may be better for understanding broader themes.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":10,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_5":{"__typename":"Paragraph","id":"afec9737778e_5","name":"185a","type":"P","href":null,"layout":null,"metadata":null,"text":"Chunk Overlap An overlap of 100–200 tokens is generally effective. 
This overlap helps maintain continuity and context between chunks, ensuring that segmentation does not disrupt the flow and coherence of the text.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":13,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_6":{"__typename":"Paragraph","id":"afec9737778e_6","name":"03b8","type":"P","href":null,"layout":null,"metadata":null,"text":"Model Compatibility The chunk size should align with the processing capabilities of the underlying language models. Some models handle larger chunks effectively, while others might be optimized for shorter chunks, suitable for sentence-level embeddings. Ensure that your chunk size is compatible with the model’s requirements to optimize performance.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":19,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_7":{"__typename":"Paragraph","id":"afec9737778e_7","name":"c11a","type":"P","href":null,"layout":null,"metadata":null,"text":"Task Specificity: The nature of your task significantly impacts the optimal chunking strategy. For tasks involving precise information retrieval, smaller, more focused chunks can enhance retrieval accuracy. 
Conversely, tasks requiring complex reasoning or broader context might benefit from larger chunks that capture more comprehensive information.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":17,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_8":{"__typename":"Paragraph","id":"afec9737778e_8","name":"a7b6","type":"P","href":null,"layout":null,"metadata":null,"text":"System Constraints: If the chunked content needs to be processed by another system with token limitations or other constraints, you must adjust chunk sizes to fit within those boundaries. Ensure that the chunks do not exceed the maximum token limits of any integrated systems or APIs to avoid processing issues.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":19,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_9":{"__typename":"Paragraph","id":"afec9737778e_9","name":"14d9","type":"H4","href":null,"layout":null,"metadata":null,"text":"The Problem We Want to Solve","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_10":{"__typename":"Paragraph","id":"afec9737778e_10","name":"a8fe","type":"P","href":null,"layout":null,"metadata":null,"text":"In a RAG application, retrieving relevant information from a vast amount of data efficiently is paramount. Incorrect chunking can result in either losing important context or including too much noise, leading to poor search results. 
The goal is to find a chunking strategy that balances precision with context retention, optimizing both the embedding process and retrieval quality.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_11":{"__typename":"Paragraph","id":"afec9737778e_11","name":"02da","type":"H3","href":null,"layout":null,"metadata":null,"text":"Simple chunking methods","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_12":{"__typename":"Paragraph","id":"afec9737778e_12","name":"4faf","type":"H4","href":null,"layout":null,"metadata":null,"text":"1 \u002F Character Splitting","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_13":{"__typename":"Paragraph","id":"afec9737778e_13","name":"623d","type":"P","href":null,"layout":null,"metadata":null,"text":"This method involves splitting text at fixed character intervals, possibly with some overlap, to ensure context is maintained across chunks. 
This is a simple yet effective approach for uniformly structured content.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_14":{"__typename":"Paragraph","id":"afec9737778e_14","name":"2a9a","type":"P","href":null,"layout":null,"metadata":null,"text":"Using Langchain","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":15,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_15":{"__typename":"Paragraph","id":"afec9737778e_15","name":"0634","type":"PRE","href":null,"layout":null,"metadata":null,"text":"text = \"\"\"\nCharacter splitting is the most basic form of splitting up your text.\nIt is the process of simply dividing your text into N-character sized chunks regardless of their content or form.\n\"\"\"\n\nfrom langchain.text_splitter import CharacterTextSplitter\ntext_splitter = CharacterTextSplitter(chunk_size = 35, chunk_overlap=10, separator=' ', strip_whitespace=False)\ndocuments = text_splitter.create_documents([text])\nprint(documents)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_16":{"__typename":"Paragraph","id":"afec9737778e_16","name":"9c01","type":"H4","href":null,"layout":null,"metadata":null,"text":"2 \u002F Recursive Character Text Splitting","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_17":{"__typename":"Paragraph","id":"afec9737778e_17","name":"3fbc","type":"P","href":null,"layout":null,"metadata":null,"text":"Recursive chunking breaks down text hierarchically, using different separators, to create contextually relevant 
chunks.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_18":{"__typename":"Paragraph","id":"afec9737778e_18","name":"3c84","type":"P","href":null,"layout":null,"metadata":null,"text":"Using Langchain","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":15,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_19":{"__typename":"Paragraph","id":"afec9737778e_19","name":"3cc3","type":"PRE","href":null,"layout":null,"metadata":null,"text":"text = \"\"\"\nThis text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ['\\n\\n', '\\n', ' ', '']. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.\n\"\"\"\n\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\n\ntext_splitter = RecursiveCharacterTextSplitter(chunk_size = 450,\n chunk_overlap=50)\ndocuments = text_splitter.create_documents([text])\nprint(documents)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_20":{"__typename":"Paragraph","id":"afec9737778e_20","name":"199e","type":"P","href":null,"layout":null,"metadata":null,"text":"Using 
Llama-index","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":17,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_21":{"__typename":"Paragraph","id":"afec9737778e_21","name":"50db","type":"PRE","href":null,"layout":null,"metadata":null,"text":"from langchain.text_splitter import RecursiveCharacterTextSplitter\nfrom llama_index.core.node_parser import LangchainNodeParser\nfrom llama_index.core import SimpleDirectoryReader\n\nreader = SimpleDirectoryReader(\"data\")\ndocuments = reader.load_data()\n\nparser = LangchainNodeParser(RecursiveCharacterTextSplitter())\nnodes = parser.get_nodes_from_documents(documents)\nprint(nodes)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_22":{"__typename":"Paragraph","id":"afec9737778e_22","name":"940d","type":"H4","href":null,"layout":null,"metadata":null,"text":"3 \u002F Sentence Splitting","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_23":{"__typename":"Paragraph","id":"afec9737778e_23","name":"5e3e","type":"P","href":null,"layout":null,"metadata":null,"text":"A Sentence Splitter breaks text into individual sentences, facilitating more precise text analysis and processing.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_24":{"__typename":"Paragraph","id":"afec9737778e_24","name":"7792","type":"P","href":null,"layout":null,"metadata":null,"text":"Using 
Llama-index","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":17,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_25":{"__typename":"Paragraph","id":"afec9737778e_25","name":"3ab1","type":"PRE","href":null,"layout":null,"metadata":null,"text":"# text = \"\"\"\n# This tool enhances tasks like information retrieval and text generation by treating each sentence as a distinct unit, ensuring context is maintained and understood correctly.\n# \"\"\"\n# with open('data\u002Ftext.txt', 'w') as f:\n# f.write(text)\n\nfrom llama_index.core.node_parser import SentenceSplitter\nfrom llama_index.core import SimpleDirectoryReader\n\nreader = SimpleDirectoryReader(\"data\")\ndocuments = reader.load_data()\n\nsplitter = SentenceSplitter(\n chunk_size=1024,\n chunk_overlap=20,\n)\nnodes = splitter.get_nodes_from_documents(documents)\nprint(nodes)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_26":{"__typename":"Paragraph","id":"afec9737778e_26","name":"db1a","type":"H4","href":null,"layout":null,"metadata":null,"text":"4 \u002F Document Specific Splitting","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_27":{"__typename":"Paragraph","id":"afec9737778e_27","name":"668a","type":"P","href":null,"layout":null,"metadata":null,"text":"A “structure-aware” chunker divides text based on its inherent structure, such as headings, lists, or sections, to preserve the content’s logical organization. 
This method ensures that chunks retain their meaningful context and coherence, which is particularly useful for structured documents like reports or manuals.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_28":{"__typename":"Paragraph","id":"afec9737778e_28","name":"29e8","type":"P","href":null,"layout":null,"metadata":null,"text":"Using Langchain","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":15,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_29":{"__typename":"Paragraph","id":"afec9737778e_29","name":"011f","type":"P","href":null,"layout":null,"metadata":null,"text":"MarkdownTextSplitter :","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_30":{"__typename":"Paragraph","id":"afec9737778e_30","name":"683a","type":"PRE","href":null,"layout":null,"metadata":null,"text":"with open(\"README.md\") as f:\n markdown_text = f.read()\n\nfrom langchain.text_splitter import MarkdownTextSplitter\n\nsplitter = MarkdownTextSplitter(chunk_size = 40, \n chunk_overlap=0)\n\ndocuments = splitter.create_documents([markdown_text])\n\nprint(documents)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_31":{"__typename":"Paragraph","id":"afec9737778e_31","name":"a0a6","type":"P","href":null,"layout":null,"metadata":null,"text":"Python code Splitter 
:","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_32":{"__typename":"Paragraph","id":"afec9737778e_32","name":"f66c","type":"PRE","href":null,"layout":null,"metadata":null,"text":"python_text = \"\"\"\nimport numpy as np\ndef mean_squared_error(y_true, y_pred):\n # Convert inputs to numpy arrays\n y_true = np.array(y_true)\n y_pred = np.array(y_pred)\n\n # Compute the squared differences\n squared_differences = (y_true - y_pred) ** 2\n\n # Compute the mean of the squared differences\n mse = np.mean(squared_differences)\n\n return mse\ny_true = [3, -0.5, 2, 7]\ny_pred = [2.5, 0.0, 2, 8]\n\nmse = mean_squared_error(y_true, y_pred)\nprint(f\"Mean Squared Error: {mse}\")\n\"\"\"\n\n\nfrom langchain.text_splitter import PythonCodeTextSplitter\npython_splitter = PythonCodeTextSplitter(chunk_size=100, chunk_overlap=0)\ndocuments = python_splitter.create_documents([python_text])\n\nprint(documents)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_33":{"__typename":"Paragraph","id":"afec9737778e_33","name":"7e45","type":"P","href":null,"layout":null,"metadata":null,"text":"javascript code Splitter :","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_34":{"__typename":"Paragraph","id":"afec9737778e_34","name":"0a01","type":"PRE","href":null,"layout":null,"metadata":null,"text":"from langchain.text_splitter import RecursiveCharacterTextSplitter, Language\njavascript_text = \"\"\"\nfunction dotProduct(vectorA, vectorB) {\n if (vectorA.length !== vectorB.length) {\n throw new Error('Vectors must be of the same length');\n }\n\n return vectorA.reduce((sum, currentValue, index) =\u003E {\n return sum + currentValue * vectorB[index];\n }, 
0);\n}\n\n\u002F\u002F Example usage:\nconst vectorA = [1, 2, 3];\nconst vectorB = [4, 5, 6];\n\nconst result = dotProduct(vectorA, vectorB);\nconsole.log(`Dot Product: ${result}`);\n\"\"\"\njs_splitter = RecursiveCharacterTextSplitter.from_language(\n language=Language.JS, chunk_size=65, chunk_overlap=0\n)\ndocuments = js_splitter.create_documents([javascript_text])\n\nprint(documents)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_35":{"__typename":"Paragraph","id":"afec9737778e_35","name":"42ae","type":"P","href":null,"layout":null,"metadata":null,"text":"Using Llama-index","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":17,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_36":{"__typename":"Paragraph","id":"afec9737778e_36","name":"700f","type":"P","href":null,"layout":null,"metadata":null,"text":"MarkdownNodeParser :","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_37":{"__typename":"Paragraph","id":"afec9737778e_37","name":"2172","type":"PRE","href":null,"layout":null,"metadata":null,"text":"from llama_index.core.node_parser import MarkdownNodeParser\nfrom llama_index.core import SimpleDirectoryReader\n\nreader = SimpleDirectoryReader(input_dir=\"data\" ,\n required_exts=[\".md\"])\n\nmarkdown_docs = reader.load_data()\n\nparser = MarkdownNodeParser()\n\nnodes = 
parser.get_nodes_from_documents(markdown_docs)\n\n\nprint(nodes)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_38":{"__typename":"Paragraph","id":"afec9737778e_38","name":"902d","type":"P","href":null,"layout":null,"metadata":null,"text":"JSONNodeParser :","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_39":{"__typename":"Paragraph","id":"afec9737778e_39","name":"7b6d","type":"PRE","href":null,"layout":null,"metadata":null,"text":"from llama_index.core.node_parser import JSONNodeParser\nfrom llama_index.core import SimpleDirectoryReader\n\nparser = JSONNodeParser()\n\nreader = SimpleDirectoryReader(input_dir=\"data\" ,\n required_exts=[\".json\"])\n\njson_docs = reader.load_data()\nnodes = parser.get_nodes_from_documents(json_docs)\nprint(nodes)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"EXPLICIT","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_40":{"__typename":"Paragraph","id":"afec9737778e_40","name":"c950","type":"P","href":null,"layout":null,"metadata":null,"text":"HTMLNodeParser :","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_41":{"__typename":"Paragraph","id":"afec9737778e_41","name":"760c","type":"PRE","href":null,"layout":null,"metadata":null,"text":"from llama_index.core.node_parser import HTMLNodeParser\n\ntags = [\"p\", \"h1\", \"h2\", \"h3\", \"h4\", \"h5\", \"h6\", \"li\", \"b\", \"i\", \"u\", \"section\"]\n\nparser = HTMLNodeParser(tags=tags)\nnodes = 
parser.get_nodes_from_documents(html_docs)\nprint(nodes)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"makefile"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_42":{"__typename":"Paragraph","id":"afec9737778e_42","name":"e675","type":"H4","href":null,"layout":null,"metadata":null,"text":"5 \u002F Semantic Chunking","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_43":{"__typename":"Paragraph","id":"afec9737778e_43","name":"5c7b","type":"P","href":null,"layout":null,"metadata":null,"text":"Semantic chunking aims to group text into chunks based on semantic meaning rather than fixed size or structure. This method uses embeddings to assess the similarity between chunks, ensuring that semantically similar content remains together.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_44":{"__typename":"Paragraph","id":"afec9737778e_44","name":"d715","type":"P","href":null,"layout":null,"metadata":null,"text":"Using Llama-index","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":17,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_45":{"__typename":"Paragraph","id":"afec9737778e_45","name":"5625","type":"P","href":null,"layout":null,"metadata":null,"text":"In LlamaIndex, the SemanticSplitterNodeParser class implements this by adaptively selecting breakpoints based on embedding similarity, with configurable parameters such as buffer_size (initial window size for chunks), breakpoint_percentile_threshold (split threshold), and embed_mode (embedding model 
used).","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"CODE","start":19,"end":45,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"CODE","start":172,"end":183,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"CODE","start":218,"end":249,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"CODE","start":273,"end":283,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"A","start":19,"end":45,"href":"https:\u002F\u002Fdocs.llamaindex.ai\u002Fen\u002Fstable\u002Fapi_reference\u002Fnode_parsers\u002Fsemantic_splitter","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_46":{"__typename":"Paragraph","id":"afec9737778e_46","name":"67c7","type":"PRE","href":null,"layout":null,"metadata":null,"text":"# pip install llama-index-embeddings-huggingface llama-index-embeddings-instructor\n\nfrom llama_index.embeddings.huggingface import HuggingFaceEmbedding\nembed_model = HuggingFaceEmbedding(model_name=\"BAAI\u002Fbge-small-en-v1.5\")\n\n\nfrom llama_index.core import SimpleDirectoryReader\nfrom llama_index.core.node_parser import (\n SentenceSplitter,\n SemanticSplitterNodeParser,\n)\nsplitter = SemanticSplitterNodeParser(\n buffer_size=1,\n breakpoint_percentile_threshold=95,\n embed_model=embed_model\n)\n\nbase_splitter = SentenceSplitter(chunk_size=512)\n\ndocuments = SimpleDirectoryReader(input_files=[\"text.txt\"]).load_data()\nnodes = 
splitter.get_nodes_from_documents(documents)\n\nprint(nodes)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_47":{"__typename":"Paragraph","id":"afec9737778e_47","name":"745c","type":"P","href":null,"layout":null,"metadata":null,"text":"Using Langchain","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"STRONG","start":0,"end":15,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_48":{"__typename":"Paragraph","id":"afec9737778e_48","name":"1f30","type":"P","href":null,"layout":null,"metadata":null,"text":"Similarly, Langchain’s SemanticChunker detects sentence boundaries by analyzing embedding differences; sentences are split when the difference exceeds a specified threshold, maintaining semantic coherence within chunks.","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"CODE","start":23,"end":38,"href":null,"anchorType":null,"userId":null,"linkMetadata":null},{"__typename":"Markup","type":"A","start":23,"end":38,"href":"https:\u002F\u002Fpython.langchain.com\u002Fv0.2\u002Fdocs\u002Fhow_to\u002Fsemantic-chunker","anchorType":"LINK","userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_49":{"__typename":"Paragraph","id":"afec9737778e_49","name":"d489","type":"PRE","href":null,"layout":null,"metadata":null,"text":"# pip install langchain_experimental fastembed langchain_community\n\nfrom langchain_community.embeddings.fastembed import FastEmbedEmbeddings\nembed_model = FastEmbedEmbeddings(model_name=\"BAAI\u002Fbge-base-en-v1.5\")\n\nfrom langchain_experimental.text_splitter import SemanticChunker\nfrom langchain.text_splitter import RecursiveCharacterTextSplitter\n\ntext_splitter = 
RecursiveCharacterTextSplitter(\n chunk_size=1000,\n chunk_overlap=0,\n length_function=len,\n is_separator_regex=False\n)\ndocuments = text_splitter.create_documents([text])\n\n\nsemantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type=\"percentile\")\nsemantic_chunks = semantic_chunker.create_documents([d.page_content for d in documents])\n\nprint(semantic_chunks)","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":{"__typename":"CodeBlockMetadata","mode":"AUTO","lang":"python"},"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_50":{"__typename":"Paragraph","id":"afec9737778e_50","name":"6306","type":"H4","href":null,"layout":null,"metadata":null,"text":"Choosing the Best Chunking Method","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_51":{"__typename":"Paragraph","id":"afec9737778e_51","name":"8535","type":"P","href":null,"layout":null,"metadata":null,"text":"Selecting the right chunking strategy depends on your application’s requirements and constraints. For simple, structured content, character splitting or recursive chunking may suffice. For more complex documents, document-specific or semantic chunking might be necessary to preserve context and meaning. 
Consider model compatibility, task specificity, and system constraints to ensure the optimal chunking method for your needs.","hasDropCap":null,"dropCapImage":null,"markups":[],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_52":{"__typename":"Paragraph","id":"afec9737778e_52","name":"5208","type":"P","href":null,"layout":null,"metadata":null,"text":"My LinkedIn : https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fayoub-kirouane3","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":14,"end":57,"href":"https:\u002F\u002Fwww.linkedin.com\u002Fin\u002Fayoub-kirouane3","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"STRONG","start":0,"end":11,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"Paragraph:afec9737778e_53":{"__typename":"Paragraph","id":"afec9737778e_53","name":"54f9","type":"P","href":null,"layout":null,"metadata":null,"text":"My HuggingFace : https:\u002F\u002Fhuggingface.co\u002Fayoubkirouane","hasDropCap":null,"dropCapImage":null,"markups":[{"__typename":"Markup","type":"A","start":17,"end":53,"href":"https:\u002F\u002Fhuggingface.co\u002Fayoubkirouane","anchorType":"LINK","userId":null,"linkMetadata":null},{"__typename":"Markup","type":"STRONG","start":0,"end":14,"href":null,"anchorType":null,"userId":null,"linkMetadata":null}],"codeBlockMetadata":null,"iframe":null,"mixtapeMetadata":null},"ImageMetadata:":{"__typename":"ImageMetadata","id":"","alt":null,"focusPercentX":null,"focusPercentY":null,"originalHeight":null,"originalWidth":null},"PostViewerEdge:postId:d56903b167c5-viewerId:lo_ea78ee978563":{"__typename":"PostViewerEdge","shouldIndexPostForExternalSearch":true,"id":"postId:d56903b167c5-viewerId:lo_ea78ee978563"},"Tag:retrieval-augmented-gen":{"__typename":"Tag","id":"retrieval-augmented-gen","displayTitle":"Retrieval Augmented 
Gen","normalizedTagSlug":"retrieval-augmented-gen"},"Tag:rag-system":{"__typename":"Tag","id":"rag-system","displayTitle":"Rag System","normalizedTagSlug":"rag-system"},"Tag:text-chunking":{"__typename":"Tag","id":"text-chunking","displayTitle":"Text Chunking","normalizedTagSlug":"text-chunking"},"Tag:large-language-models":{"__typename":"Tag","id":"large-language-models","displayTitle":"Large Language Models","normalizedTagSlug":"large-language-models"},"Tag:genai":{"__typename":"Tag","id":"genai","displayTitle":"Genai","normalizedTagSlug":"genai"},"Post:d56903b167c5":{"__typename":"Post","id":"d56903b167c5","collection":null,"content({\"postMeteringOptions\":{}})":{"__typename":"PostContent","isLockedPreviewOnly":false,"bodyModel":{"__typename":"RichText","sections":[{"__typename":"Section","name":"5d0d","startIndex":0,"textLayout":null,"imageLayout":null,"backgroundImage":null,"videoLayout":null,"backgroundVideo":null},{"__typename":"Section","name":"92d9","startIndex":26,"textLayout":null,"imageLayout":null,"backgroundImage":null,"videoLayout":null,"backgroundVideo":null},{"__typename":"Section","name":"c0c4","startIndex":42,"textLayout":null,"imageLayout":null,"backgroundImage":null,"videoLayout":null,"backgroundVideo":null},{"__typename":"Section","name":"7e81","startIndex":52,"textLayout":null,"imageLayout":null,"backgroundImage":null,"videoLayout":null,"backgroundVideo":null}],"paragraphs":[{"__ref":"Paragraph:afec9737778e_0"},{"__ref":"Paragraph:afec9737778e_1"},{"__ref":"Paragraph:afec9737778e_2"},{"__ref":"Paragraph:afec9737778e_3"},{"__ref":"Paragraph:afec9737778e_4"},{"__ref":"Paragraph:afec9737778e_5"},{"__ref":"Paragraph:afec9737778e_6"},{"__ref":"Paragraph:afec9737778e_7"},{"__ref":"Paragraph:afec9737778e_8"},{"__ref":"Paragraph:afec9737778e_9"},{"__ref":"Paragraph:afec9737778e_10"},{"__ref":"Paragraph:afec9737778e_11"},{"__ref":"Paragraph:afec9737778e_12"},{"__ref":"Paragraph:afec9737778e_13"},{"__ref":"Paragraph:afec9737778e_14"},{"__ref":"Paragrap
h:afec9737778e_15"},{"__ref":"Paragraph:afec9737778e_16"},{"__ref":"Paragraph:afec9737778e_17"},{"__ref":"Paragraph:afec9737778e_18"},{"__ref":"Paragraph:afec9737778e_19"},{"__ref":"Paragraph:afec9737778e_20"},{"__ref":"Paragraph:afec9737778e_21"},{"__ref":"Paragraph:afec9737778e_22"},{"__ref":"Paragraph:afec9737778e_23"},{"__ref":"Paragraph:afec9737778e_24"},{"__ref":"Paragraph:afec9737778e_25"},{"__ref":"Paragraph:afec9737778e_26"},{"__ref":"Paragraph:afec9737778e_27"},{"__ref":"Paragraph:afec9737778e_28"},{"__ref":"Paragraph:afec9737778e_29"},{"__ref":"Paragraph:afec9737778e_30"},{"__ref":"Paragraph:afec9737778e_31"},{"__ref":"Paragraph:afec9737778e_32"},{"__ref":"Paragraph:afec9737778e_33"},{"__ref":"Paragraph:afec9737778e_34"},{"__ref":"Paragraph:afec9737778e_35"},{"__ref":"Paragraph:afec9737778e_36"},{"__ref":"Paragraph:afec9737778e_37"},{"__ref":"Paragraph:afec9737778e_38"},{"__ref":"Paragraph:afec9737778e_39"},{"__ref":"Paragraph:afec9737778e_40"},{"__ref":"Paragraph:afec9737778e_41"},{"__ref":"Paragraph:afec9737778e_42"},{"__ref":"Paragraph:afec9737778e_43"},{"__ref":"Paragraph:afec9737778e_44"},{"__ref":"Paragraph:afec9737778e_45"},{"__ref":"Paragraph:afec9737778e_46"},{"__ref":"Paragraph:afec9737778e_47"},{"__ref":"Paragraph:afec9737778e_48"},{"__ref":"Paragraph:afec9737778e_49"},{"__ref":"Paragraph:afec9737778e_50"},{"__ref":"Paragraph:afec9737778e_51"},{"__ref":"Paragraph:afec9737778e_52"},{"__ref":"Paragraph:afec9737778e_53"}]},"validatedShareKey":"","shareKeyCreator":null},"creator":{"__ref":"User:4751fd7878c5"},"inResponseToEntityType":null,"isLocked":false,"isMarkedPaywallOnly":false,"lockedSource":"LOCKED_POST_SOURCE_NONE","mediumUrl":"https:\u002F\u002Fmedium.com\u002F@ayoubkirouane3\u002Fsimple-chunking-strategies-for-rag-applications-part-1-d56903b167c5","primaryTopic":null,"topics":[{"__typename":"Topic","slug":"programming"}],"isPublished":true,"latestPublishedVersion":"afec9737778e","visibility":"PUBLIC","postResponses":{"__typename":"PostRes
ponses","count":0},"clapCount":94,"allowResponses":true,"isLimitedState":false,"title":"Simple Chunking Strategies for RAG Applications (Part 1)","isSeries":false,"sequence":null,"uniqueSlug":"simple-chunking-strategies-for-rag-applications-part-1-d56903b167c5","socialTitle":"","socialDek":"","canonicalUrl":"","metaDescription":"","latestPublishedAt":1724754238071,"readingTime":4.860377358490566,"previewContent":{"__typename":"PreviewContent","subtitle":"When building a RAG (Retrieval-Augmented Generation) system, chunking text into manageable segments is a crucial step. Chunking not only…"},"previewImage":{"__ref":"ImageMetadata:"},"isShortform":false,"seoTitle":"","firstPublishedAt":1724754238071,"updatedAt":1731862203528,"shortformType":"SHORTFORM_TYPE_LINK","seoDescription":"","viewerEdge":{"__ref":"PostViewerEdge:postId:d56903b167c5-viewerId:lo_ea78ee978563"},"isSuspended":false,"license":"ALL_RIGHTS_RESERVED","tags":[{"__ref":"Tag:retrieval-augmented-gen"},{"__ref":"Tag:rag-system"},{"__ref":"Tag:text-chunking"},{"__ref":"Tag:large-language-models"},{"__ref":"Tag:genai"}],"isNewsletter":false,"statusForCollection":null,"pendingCollection":null,"detectedLanguage":"en","wordCount":1288,"layerCake":0,"responsesLocked":false}}</script><script>window.__MIDDLEWARE_STATE__={"session":{"xsrf":""},"cache":{"cacheStatus":"EXPIRED"}}</script><script src="https://cdn-client.medium.com/lite/static/js/manifest.aa9242f7.js"></script><script src="https://cdn-client.medium.com/lite/static/js/9865.1496d74a.js"></script><script src="https://cdn-client.medium.com/lite/static/js/main.e556b4ac.js"></script><script src="https://cdn-client.medium.com/lite/static/js/instrumentation.d9108df7.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/reporting.ff22a7a5.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5049.d1ead72d.chunk.js"></script> <script 
src="https://cdn-client.medium.com/lite/static/js/4810.6318add7.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6618.db187378.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2707.b0942613.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/9977.5b3eb23a.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8599.1ab63137.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5250.9f9e01d2.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5787.e66a3a4d.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2648.26563adf.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8393.826a25fb.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/3104.c3413b66.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/3735.afb7e926.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/5642.8ad8a900.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6546.cd03f950.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/6834.08de95de.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7346.72622eb9.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2420.2a5e2d95.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/839.ca7937c2.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7975.d195c6f1.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2106.21ff89d3.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/7394.094844de.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2961.00a48598.chunk.js"></script> <script 
src="https://cdn-client.medium.com/lite/static/js/8204.c4082863.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/4391.59acaed3.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/PostPage.MainContent.1387c5dc.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/8414.6565ad5f.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/3974.8d3e0217.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/2527.a0afad8a.chunk.js"></script> <script src="https://cdn-client.medium.com/lite/static/js/PostResponsesContent.36c2ecf4.chunk.js"></script><script>window.main();</script><script>(function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'8e9552248b68819e',t:'MTczMjc0NDA5MS4wMDAwMDA='};var a=document.createElement('script');a.nonce='';a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();</script></body></html>