<!-- CINXE.COM -->

<!DOCTYPE html> <html lang="en"><head><meta charset="utf-8"><link rel=canonical href='https://www.regular-expressions.info/wordboundaries.html'><title>Regex Tutorial - \b Word Boundaries</title> <meta name="viewport" content="width=device-width, initial-scale=1"> <meta name="author" content="Jan Goyvaerts"> <meta name="description" content="In regular expressions, \b anchors the regex at a word boundary or the position between a word and a non-word character, or vice versa."> <meta name="keywords" content=""> <link rel=stylesheet href="regex.css" type="text/css"><script src="theme.js" type="text/javascript"></script><link rel="alternate" type="application/rss+xml" title="New at Regular-Expressions.info" href="updates.xml"> </head> <body bgcolor=white text=black> <div id=top></div> <div id=btntop><div id=btngrid><a href="quickstart.html" target="_top"><div>Quick Start</div></a><a href="tutorial.html" target="_top"><div>Tutorial</div></a><a href="tools.html" target="_top"><div>Tools & Languages</div></a><a href="examples.html" target="_top"><div>Examples</div></a><a href="refflavors.html" target="_top"><div>Reference</div></a><a href="books.html" target="_top"><div>Book Reviews</div></a></div></div> <div id=contents><div id=side> <TABLE CLASS=side CELLSPACING=0 CELLPADDING=4><TR><TD CLASS=sideheader>Regex Tutorial</TD></TR><TR><TD><A HREF="tutorial.html" TARGET=_top>Introduction</A></TD></TR><TR><TD><A HREF="tutorialcnt.html" TARGET=_top>Table of Contents</A></TD></TR><TR><TD><A HREF="characters.html" TARGET=_top>Special Characters</A></TD></TR><TR><TD><A HREF="nonprint.html" TARGET=_top>Non-Printable Characters</A></TD></TR><TR><TD><A HREF="engine.html" TARGET=_top>Regex Engine Internals</A></TD></TR><TR><TD><A HREF="charclass.html" TARGET=_top>Character Classes</A></TD></TR><TR><TD><A HREF="charclasssubtract.html" TARGET=_top>Character Class Subtraction</A></TD></TR><TR><TD><A HREF="charclassintersect.html" TARGET=_top>Character Class 
Intersection</A></TD></TR><TR><TD><A HREF="shorthand.html" TARGET=_top>Shorthand Character Classes</A></TD></TR><TR><TD><A HREF="dot.html" TARGET=_top>Dot</A></TD></TR><TR><TD><A HREF="anchors.html" TARGET=_top>Anchors</A></TD></TR><TR><TD><A HREF="wordboundaries.html" TARGET=_top>Word Boundaries</A></TD></TR><TR><TD><A HREF="alternation.html" TARGET=_top>Alternation</A></TD></TR><TR><TD><A HREF="optional.html" TARGET=_top>Optional Items</A></TD></TR><TR><TD><A HREF="repeat.html" TARGET=_top>Repetition</A></TD></TR><TR><TD><A HREF="brackets.html" TARGET=_top>Grouping & Capturing</A></TD></TR><TR><TD><A HREF="backref.html" TARGET=_top>Backreferences</A></TD></TR><TR><TD><A HREF="backref2.html" TARGET=_top>Backreferences, part 2</A></TD></TR><TR><TD><A HREF="named.html" TARGET=_top>Named Groups</A></TD></TR><TR><TD><A HREF="backrefrel.html" TARGET=_top>Relative Backreferences</A></TD></TR><TR><TD><A HREF="branchreset.html" TARGET=_top>Branch Reset Groups</A></TD></TR><TR><TD><A HREF="freespacing.html" TARGET=_top>Free-Spacing & Comments</A></TD></TR><TR><TD><A HREF="unicode.html" TARGET=_top>Unicode</A></TD></TR><TR><TD><A HREF="modifiers.html" TARGET=_top>Mode Modifiers</A></TD></TR><TR><TD><A HREF="atomic.html" TARGET=_top>Atomic Grouping</A></TD></TR><TR><TD><A HREF="possessive.html" TARGET=_top>Possessive Quantifiers</A></TD></TR><TR><TD><A HREF="lookaround.html" TARGET=_top>Lookahead & Lookbehind</A></TD></TR><TR><TD><A HREF="lookaround2.html" TARGET=_top>Lookaround, part 2</A></TD></TR><TR><TD><A HREF="keep.html" TARGET=_top>Keep Text out of The Match</A></TD></TR><TR><TD><A HREF="conditional.html" TARGET=_top>Conditionals</A></TD></TR><TR><TD><A HREF="balancing.html" TARGET=_top>Balancing Groups</A></TD></TR><TR><TD><A HREF="recurse.html" TARGET=_top>Recursion</A></TD></TR><TR><TD><A HREF="subroutine.html" TARGET=_top>Subroutines</A></TD></TR><TR><TD><A HREF="recurseinfinite.html" TARGET=_top>Infinite Recursion</A></TD></TR><TR><TD><A HREF="recurserepeat.html" 
TARGET=_top>Recursion & Quantifiers</A></TD></TR><TR><TD><A HREF="recursecapture.html" TARGET=_top>Recursion & Capturing</A></TD></TR><TR><TD><A HREF="recursebackref.html" TARGET=_top>Recursion & Backreferences</A></TD></TR><TR><TD><A HREF="recursebacktrack.html" TARGET=_top>Recursion & Backtracking</A></TD></TR><TR><TD><A HREF="posixbrackets.html" TARGET=_top>POSIX Bracket Expressions</A></TD></TR><TR><TD><A HREF="zerolength.html" TARGET=_top>Zero-Length Matches</A></TD></TR><TR><TD><A HREF="continue.html" TARGET=_top>Continuing Matches</A></TD></TR> </TABLE><TABLE CLASS=side CELLSPACING=0 CELLPADDING=4><TR><TD CLASS=sideheader>More on This Site</TD></TR><TR><TD><A HREF="index.html" TARGET=_top>Introduction</A></TD></TR><TR><TD><A HREF="quickstart.html" TARGET=_top>Regular Expressions Quick Start</A></TD></TR><TR><TD><A HREF="tutorial.html" TARGET=_top>Regular Expressions Tutorial</A></TD></TR><TR><TD><A HREF="replacetutorial.html" TARGET=_top>Replacement Strings Tutorial</A></TD></TR><TR><TD><A HREF="tools.html" TARGET=_top>Applications and Languages</A></TD></TR><TR><TD><A HREF="examples.html" TARGET=_top>Regular Expressions Examples</A></TD></TR><TR><TD><A HREF="refflavors.html" TARGET=_top>Regular Expressions Reference</A></TD></TR><TR><TD><A HREF="refreplace.html" TARGET=_top>Replacement Strings Reference</A></TD></TR><TR><TD><A HREF="books.html" TARGET=_top>Book Reviews</A></TD></TR><TR><TD><A HREF="print.html" TARGET=_top>Printable PDF</A></TD></TR><TR><TD><A HREF="about.html" TARGET=_top>About This Site</A></TD></TR><TR><TD><A HREF="updates.html" TARGET=_top>RSS Feed & Blog</A></TD></TR></TABLE></DIV><div class=bodytext><div class=topad style="height:130px"><A HREF="https://www.regexbuddy.com/create.html" TARGET="_top"><picture><source media="(max-width: 370px)" srcset="ads/320/rxbtutorial100.png 1x, ads/320/rxbtutorial150.png 1.5x, ads/320/rxbtutorial200.png 2x, ads/320/rxbtutorial250.png 2.5x, ads/320/rxbtutorial300.png 3x, ads/320/rxbtutorial350.png 
3.5x, ads/320/rxbtutorial400.png 4x"><source media="(max-width: 500px)" srcset="ads/360/rxbtutorial100.png 1x, ads/360/rxbtutorial150.png 1.5x, ads/360/rxbtutorial200.png 2x, ads/360/rxbtutorial250.png 2.5x, ads/360/rxbtutorial300.png 3x, ads/360/rxbtutorial350.png 3.5x, ads/360/rxbtutorial400.png 4x"><source media="(max-width: 660px)" srcset="ads/480/rxbtutorial100.png 1x, ads/480/rxbtutorial150.png 1.5x, ads/480/rxbtutorial200.png 2x, ads/480/rxbtutorial250.png 2.5x, ads/480/rxbtutorial300.png 3x, ads/480/rxbtutorial350.png 3.5x, ads/480/rxbtutorial400.png 4x"><source media="(max-width: 747px)" srcset="ads/640/rxbtutorial100.png 1x, ads/640/rxbtutorial150.png 1.5x, ads/640/rxbtutorial200.png 2x, ads/640/rxbtutorial250.png 2.5x, ads/640/rxbtutorial300.png 3x, ads/640/rxbtutorial350.png 3.5x, ads/640/rxbtutorial400.png 4x"><img src="ads/728/rxbtutorial100.png" srcset="ads/728/rxbtutorial100.png 1x, ads/728/rxbtutorial125.png 1.25x, ads/728/rxbtutorial150.png 1.5x, ads/728/rxbtutorial175.png 1.75x, ads/728/rxbtutorial200.png 2x, ads/728/rxbtutorial250.png 2.5x, ads/728/rxbtutorial300.png 3x, ads/728/rxbtutorial350.png 3.5x, ads/728/rxbtutorial400.png 4x" alt="RegexBuddy—Better than a regular expression tutorial!"></picture></A></div> <div class=bulb><h1>Word Boundaries</h1><script type="text/javascript">showbulb();</script></div> The metacharacter <TT CLASS=syntax>\b</TT> is an <A HREF="anchors.html" TARGET="_top">anchor</A> like the caret and the dollar sign. It matches at a position that is called a “word boundary”. This match is zero-length. 
There are three different positions that qualify as word boundaries: <ul> <li>Before the first character in the string, if the first character is a word character.</li> <li>After the last character in the string, if the last character is a word character.</li> <li>Between two characters in the string, where one is a word character and the other is not a word character.</li> </ul> Simply put: <TT CLASS=syntax>\b</TT> allows you to perform a “whole words only” search using a regular expression in the form of <TT CLASS=syntax>\bword\b</TT>. A “word character” is a character that can be used to form words. All characters that are not “word characters” are “non-word characters”. Exactly which characters are word characters depends on the regex flavor you’re working with. In most flavors, characters that are matched by the <A HREF="shorthand.html" TARGET="_top">short-hand character class</A> <TT CLASS=syntax>\w</TT> are the characters that are treated as word characters by word boundaries. <A HREF="java.html" TARGET="_top">Java</A> is an exception. Java supports Unicode for <TT CLASS=syntax>\b</TT> but not for <TT CLASS=syntax>\w</TT>. Most flavors, except the ones discussed below, have only one metacharacter that matches both before a word and after a word. This is because any position between characters can never be both at the start and at the end of a word. Using only one operator makes things easier for you. Since digits are considered to be word characters, <TT CLASS=syntax>\b4\b</TT> can be used to match a 4 that is not part of a larger number. This regex does not match <tt class=string>44 sheets of a4</tt>. So saying “<TT CLASS=syntax>\b</TT> matches before and after an alphanumeric sequence” is more exact than saying “before and after a word”. <TT CLASS=syntax>\B</TT> is the negated version of <TT CLASS=syntax>\b</TT>. <TT CLASS=syntax>\B</TT> matches at every position where <TT CLASS=syntax>\b</TT> does not. 
Effectively, <TT CLASS=syntax>\B</TT> matches at any position between two word characters as well as at any position between two non-word characters. <h2>Looking Inside The Regex Engine</h2> Let’s see what happens when we apply the regex <TT CLASS=syntax>\bis\b</TT> to the string <tt class=string>This island is beautiful</tt>. The engine starts with the first token <TT CLASS=syntax>\b</TT> at the first character <tt class=string>T</tt>. Since this token is zero-length, the position before the character is inspected. <TT CLASS=syntax>\b</TT> matches here, because the T is a word character and the character before it is the void before the start of the string. The engine continues with the next token: the literal <TT CLASS=syntax>i</TT>. The engine does not advance to the next character in the string, because the previous regex token was zero-length. <TT CLASS=syntax>i</TT> does not match <tt class=string>T</tt>, so the engine retries the first token at the next character position. <TT CLASS=syntax>\b</TT> cannot match at the position between the <tt class=string>T</tt> and the <tt class=string>h</tt>. It cannot match between the <tt class=string>h</tt> and the <tt class=string>i</tt> either, and neither between the <tt class=string>i</tt> and the <tt class=string>s</tt>. The next character in the string is a space. <TT CLASS=syntax>\b</TT> matches here because the space is not a word character, and the preceding character is. Again, the engine continues with the <TT CLASS=syntax>i</TT> which does not match with the space. Advancing a character and restarting with the first regex token, <TT CLASS=syntax>\b</TT> matches between the space and the second <tt class=string>i</tt> in the string. Continuing, the regex engine finds that <TT CLASS=syntax>i</TT> matches <tt class=match>i</tt> and <TT CLASS=syntax>s</TT> matches <tt class=match>s</tt>. Now, the engine tries to match the second <TT CLASS=syntax>\b</TT> at the position before the <tt class=string>l</tt>. 
This fails because this position is between two word characters. The engine reverts to the start of the regex and advances one character to the <tt class=string>s</tt> in <tt class=string>island</tt>. Again, the <TT CLASS=syntax>\b</TT> fails to match and continues to do so until the second space is reached. It matches there, but matching the <TT CLASS=syntax>i</TT> fails. But <TT CLASS=syntax>\b</TT> matches at the position before the third <tt class=string>i</tt> in the string. The engine continues, and finds that <TT CLASS=syntax>i</TT> matches <tt class=match>i</tt> and <TT CLASS=syntax>s</TT> matches <tt class=match>s</tt>. The last token in the regex, <TT CLASS=syntax>\b</TT>, also matches at the position before the third space in the string because the space is not a word character, and the character before it is. The engine has successfully matched the word <tt class=match>is</tt> in our string, skipping the two earlier occurrences of the characters i and s. If we had used the regular expression <TT CLASS=syntax>is</TT>, it would have matched the <tt class=match>is</tt> in <tt class=string>This</tt>. <a name="tcl"></a><h2>Tcl Word Boundaries</h2> Word boundaries, as described above, are supported by most regular expression flavors. Notable exceptions are the <A HREF="posix.html" TARGET="_top">POSIX</A> and <A HREF="xml.html" TARGET="_top">XML Schema</A> flavors, which don’t support word boundaries at all. <A HREF="tcl.html" TARGET="_top">Tcl</A> uses a different syntax. In Tcl, <TT CLASS=syntax>\b</TT> matches a backspace character, just like <TT CLASS=syntax>\x08</TT> in most regex flavors (including Tcl’s). <TT CLASS=syntax>\B</TT> matches a single backslash character in Tcl, just like <TT CLASS=syntax>\\</TT> in all other regex flavors (and Tcl too). Tcl uses the letter “y” instead of the letter “b” to match word boundaries. 
<TT CLASS=syntax>\y</TT> matches at any word boundary position, while <TT CLASS=syntax>\Y</TT> matches at any position that is not a word boundary. These Tcl regex tokens match exactly the same as <TT CLASS=syntax>\b</TT> and <TT CLASS=syntax>\B</TT> in Perl-style regex flavors. They don’t discriminate between the start and the end of a word. Tcl has two more word boundary tokens that do discriminate between the start and end of a word. <TT CLASS=syntax>\m</TT> matches only at the start of a word. That is, it matches at any position that has a non-word character to the left of it, and a word character to the right of it. It also matches at the start of the string if the first character in the string is a word character. <TT CLASS=syntax>\M</TT> matches only at the end of a word. It matches at any position that has a word character to the left of it, and a non-word character to the right of it. It also matches at the end of the string if the last character in the string is a word character. The only regex engine that supports Tcl-style word boundaries (besides Tcl itself) is the <A HREF="jgsoft.html" TARGET="_top">JGsoft engine</A>. In <A HREF="powergrep.html" TARGET="_top">PowerGREP</A> and <A HREF="editpadpro.html" TARGET="_top">EditPad Pro</A>, <TT CLASS=syntax>\b</TT> and <TT CLASS=syntax>\B</TT> are Perl-style word boundaries, while <TT CLASS=syntax>\y</TT>, <TT CLASS=syntax>\Y</TT>, <TT CLASS=syntax>\m</TT> and <TT CLASS=syntax>\M</TT> are Tcl-style word boundaries. In most situations, the lack of <TT CLASS=syntax>\m</TT> and <TT CLASS=syntax>\M</TT> tokens is not a problem. <TT CLASS=syntax>\yword\y</TT> finds “whole words only” occurrences of “word” just like <TT CLASS=syntax>\mword\M</TT> would. <TT CLASS=syntax>\Mword\m</TT> could never match anywhere, since <TT CLASS=syntax>\M</TT> never matches at a position followed by a word character, and <TT CLASS=syntax>\m</TT> never at a position preceded by one. 
If your regular expression needs to match characters before or after <TT CLASS=syntax>\y</TT>, you can easily specify in the regex whether these characters should be word characters or non-word characters. If you want to match any word, <TT CLASS=syntax>\y\w+\y</TT> gives the same result as <TT CLASS=syntax>\m.+\M</TT>. Using <TT CLASS=syntax>\w</TT> instead of the dot automatically restricts the first <TT CLASS=syntax>\y</TT> to the start of a word, and the second <TT CLASS=syntax>\y</TT> to the end of a word. Note that <TT CLASS=syntax>\y.+\y</TT> would not work. This regex matches each word, and also each sequence of non-word characters between the words in your subject string. That said, if your flavor supports <TT CLASS=syntax>\m</TT> and <TT CLASS=syntax>\M</TT>, the regex engine could apply <TT CLASS=syntax>\m\w+\M</TT> slightly faster than <TT CLASS=syntax>\y\w+\y</TT>, depending on its internal optimizations. If your regex flavor supports <A HREF="lookaround.html" TARGET="_top">lookahead and lookbehind</A>, you can use <TT CLASS=syntax>(?&lt;!\w)(?=\w)</TT> to emulate Tcl’s <TT CLASS=syntax>\m</TT> and <TT CLASS=syntax>(?&lt;=\w)(?!\w)</TT> to emulate <TT CLASS=syntax>\M</TT>. Though quite a bit more verbose, these lookaround constructs match exactly the same as Tcl’s word boundaries. If your flavor has lookahead but not lookbehind, and also has Perl-style word boundaries, you can use <TT CLASS=syntax>\b(?=\w)</TT> to emulate Tcl’s <TT CLASS=syntax>\m</TT> and <TT CLASS=syntax>\b(?!\w)</TT> to emulate <TT CLASS=syntax>\M</TT>. <TT CLASS=syntax>\b</TT> matches at the start or end of a word, and the lookahead checks if the next character is part of a word or not. If it is, we’re at the start of a word. Otherwise, we’re at the end of a word. 
<a name="gnu"></a><h2>GNU Word Boundaries</h2> The <A HREF="gnu.html" TARGET="_top">GNU extensions</A> to POSIX regular expressions add support for the <TT CLASS=syntax>\b</TT> and <TT CLASS=syntax>\B</TT> word boundaries, as described above. GNU also uses its own syntax for start-of-word and end-of-word boundaries. <TT CLASS=syntax>\&lt;</TT> matches at the start of a word, like Tcl’s <TT CLASS=syntax>\m</TT>. <TT CLASS=syntax>\&gt;</TT> matches at the end of a word, like Tcl’s <TT CLASS=syntax>\M</TT>. <A HREF="boost.html" TARGET="_top">Boost</A> also treats <TT CLASS=syntax>\&lt;</TT> and <TT CLASS=syntax>\&gt;</TT> as word boundaries when using the ECMAScript, extended, egrep, or awk grammar. <a name="posix"></a><h2>POSIX Word Boundaries</h2> The <A HREF="posix.html" TARGET="_top">POSIX</A> standard defines <TT CLASS=syntax>[[:&lt;:]]</TT> as a start-of-word boundary, and <TT CLASS=syntax>[[:&gt;:]]</TT> as an end-of-word boundary. Though the syntax is borrowed from <A HREF="posixbrackets.html" TARGET="_top">POSIX bracket expressions</A>, these tokens are word boundaries that have nothing to do with and cannot be used inside character classes. Tcl and GNU also support POSIX word boundaries. <A HREF="pcre.html" TARGET="_top">PCRE</A> supports POSIX word boundaries starting with version 8.34. Boost supports them in all its grammars. 
<div id=cntmobi>|&ensp;<a href='quickstart.html'>Quick Start</a>&ensp;|&ensp;<a href='tutorial.html'>Tutorial</a>&ensp;|&ensp;<a href='tools.html'>Tools & Languages</a>&ensp;|&ensp;<a href='examples.html'>Examples</a>&ensp;|&ensp;<a href='refflavors.html'>Reference</a>&ensp;|&ensp;<a href='books.html'>Book Reviews</a>&ensp;||&ensp;<a href='tutorial.html'>Introduction</a>&ensp;|&ensp;<a href='tutorialcnt.html'>Table of Contents</a>&ensp;|&ensp;<a href='characters.html'>Special Characters</a>&ensp;|&ensp;<a href='nonprint.html'>Non-Printable Characters</a>&ensp;|&ensp;<a href='engine.html'>Regex Engine Internals</a>&ensp;|&ensp;<a href='charclass.html'>Character Classes</a>&ensp;|&ensp;<a href='charclasssubtract.html'>Character Class Subtraction</a>&ensp;|&ensp;<a href='charclassintersect.html'>Character Class Intersection</a>&ensp;|&ensp;<a href='shorthand.html'>Shorthand Character Classes</a>&ensp;|&ensp;<a href='dot.html'>Dot</a>&ensp;|&ensp;<a href='anchors.html'>Anchors</a>&ensp;|&ensp;<a href='wordboundaries.html'>Word Boundaries</a>&ensp;|&ensp;<a href='alternation.html'>Alternation</a>&ensp;|&ensp;<a href='optional.html'>Optional Items</a>&ensp;|&ensp;<a href='repeat.html'>Repetition</a>&ensp;|&ensp;<a href='brackets.html'>Grouping & Capturing</a>&ensp;|&ensp;<a href='backref.html'>Backreferences</a>&ensp;|&ensp;<a href='backref2.html'>Backreferences, part 2</a>&ensp;|&ensp;<a href='named.html'>Named Groups</a>&ensp;|&ensp;<a href='backrefrel.html'>Relative Backreferences</a>&ensp;|&ensp;<a href='branchreset.html'>Branch Reset Groups</a>&ensp;|&ensp;<a href='freespacing.html'>Free-Spacing & Comments</a>&ensp;|&ensp;<a href='unicode.html'>Unicode</a>&ensp;|&ensp;<a href='modifiers.html'>Mode Modifiers</a>&ensp;|&ensp;<a href='atomic.html'>Atomic Grouping</a>&ensp;|&ensp;<a href='possessive.html'>Possessive Quantifiers</a>&ensp;|&ensp;<a href='lookaround.html'>Lookahead & Lookbehind</a>&ensp;|&ensp;<a href='lookaround2.html'>Lookaround, part 
2</a>&ensp;|&ensp;<a href='keep.html'>Keep Text out of The Match</a>&ensp;|&ensp;<a href='conditional.html'>Conditionals</a>&ensp;|&ensp;<a href='balancing.html'>Balancing Groups</a>&ensp;|&ensp;<a href='recurse.html'>Recursion</a>&ensp;|&ensp;<a href='subroutine.html'>Subroutines</a>&ensp;|&ensp;<a href='recurseinfinite.html'>Infinite Recursion</a>&ensp;|&ensp;<a href='recurserepeat.html'>Recursion & Quantifiers</a>&ensp;|&ensp;<a href='recursecapture.html'>Recursion & Capturing</a>&ensp;|&ensp;<a href='recursebackref.html'>Recursion & Backreferences</a>&ensp;|&ensp;<a href='recursebacktrack.html'>Recursion & Backtracking</a>&ensp;|&ensp;<a href='posixbrackets.html'>POSIX Bracket Expressions</a>&ensp;|&ensp;<a href='zerolength.html'>Zero-Length Matches</a>&ensp;|&ensp;<a href='continue.html'>Continuing Matches</a>&ensp;|</div> <div id=copyright> Page URL: <A HREF="https://www.regular-expressions.info/wordboundaries.html" TARGET="_top">https://www.regular-expressions.info/wordboundaries.html</A> Page last updated: 22 November 2019 Site last updated: 06 November 2024 Copyright © 2003-2024 Jan Goyvaerts. All rights reserved. </div> </div> </div> </body></html>