% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/search.R
\name{about_search_boundaries}
\alias{about_search_boundaries}
\alias{search_boundaries}
\alias{stringi-search-boundaries}
\title{Text Boundary Analysis in \pkg{stringi}}
\description{
Text boundary analysis is the process of locating linguistic boundaries
while formatting and handling text.
}
\details{
Examples of the boundary analysis process include:
\itemize{
\item Locating positions to word-wrap text to fit
within specific margins while displaying or printing,
see \code{\link{stri_wrap}} and \code{\link{stri_split_boundaries}}.
\item Counting characters, words, sentences, or paragraphs,
see \code{\link{stri_count_boundaries}}.
\item Making a list of the unique words in a document,
see \code{\link{stri_extract_all_words}} and then \code{\link{stri_unique}}.
\item Capitalizing the first letter of each word
or sentence, see also \code{\link{stri_trans_totitle}}.
\item Locating a particular unit of the text (for example,
finding the third word in the document),
see \code{\link{stri_locate_all_boundaries}}.
}
Generally, text boundary analysis is a locale-dependent operation.
For example, in Japanese and Chinese one does not separate words with spaces
-- a line break can occur even in the middle of a word.
These languages have punctuation and diacritical
marks that cannot start or end a line, so this must also be taken into account.
\pkg{stringi} uses \pkg{ICU}'s \code{BreakIterator} to locate specific
text boundaries. Note that the \code{BreakIterator}'s behavior
may be controlled in some cases, see \code{\link{stri_opts_brkiter}}.
\itemize{
\item The \code{character} boundary iterator tries to match what a user
would think of as a ``character'' -- a basic unit of a writing system
for a language -- which may be more than just a single Unicode code point.
\item The \code{word} boundary iterator locates the boundaries
of words, for purposes such as ``Find whole words'' operations.
\item The \code{line_break} iterator locates positions that would
be appropriate to wrap lines when displaying the text.
\item The break iterator of type \code{sentence}
locates sentence boundaries.
}
For technical details on the different classes of text boundaries, refer
to the \pkg{ICU} User Guide listed in the References section below.
}
\references{
\emph{Boundary Analysis} -- ICU User Guide,
\url{https://unicode-org.github.io/icu/userguide/boundaryanalysis/}
}
\seealso{
The official online manual of \pkg{stringi} at \url{https://stringi.gagolewski.com/}

Gagolewski M., \pkg{stringi}: Fast and portable character string processing in R, \emph{Journal of Statistical Software} 103(2), 2022, 1--59, \doi{10.18637/jss.v103.i02}

Other locale_sensitive:
\code{\link{\%s<\%}()},
\code{\link{about_locale}},
\code{\link{about_search_coll}},
\code{\link{stri_compare}()},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_duplicated}()},
\code{\link{stri_enc_detect2}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_collator}()},
\code{\link{stri_order}()},
\code{\link{stri_rank}()},
\code{\link{stri_sort_key}()},
\code{\link{stri_sort}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_unique}()},
\code{\link{stri_wrap}()}

Other text_boundaries:
\code{\link{about_search}},
\code{\link{stri_count_boundaries}()},
\code{\link{stri_extract_all_boundaries}()},
\code{\link{stri_locate_all_boundaries}()},
\code{\link{stri_opts_brkiter}()},
\code{\link{stri_split_boundaries}()},
\code{\link{stri_split_lines}()},
\code{\link{stri_trans_tolower}()},
\code{\link{stri_wrap}()}

Other stringi_general_topics:
\code{\link{about_arguments}},
\code{\link{about_encoding}},
\code{\link{about_locale}},
\code{\link{about_search_charclass}},
\code{\link{about_search_coll}},
\code{\link{about_search_fixed}},
\code{\link{about_search_regex}},
\code{\link{about_search}},
\code{\link{about_stringi}}
}
\concept{locale_sensitive}
\concept{stringi_general_topics}
\concept{text_boundaries}
\author{
\href{https://www.gagolewski.com/}{Marek Gagolewski} and other contributors
}