[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
/*
YUI 3.17.2 (build 9c3c78e)
Copyright 2014 Yahoo! Inc. All rights reserved.
Licensed under the BSD License.
http://yuilibrary.com/license/
*/

YUI.add('text-wordbreak', function (Y, NAME) {

/**
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary.
 *
 * @module text
 * @submodule text-wordbreak
 */

/**
 * <p>
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary, using the generic word
 * breaking algorithm defined in the Unicode Text Segmentation guidelines
 * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
 * Annex #29</a>).
 * </p>
 *
 * <p>
 * This algorithm provides a reasonable default for many languages. However, it
 * does not cover language or context specific requirements, and it does not
 * provide meaningful results at all for languages that don't use spaces between
 * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
 * word breaking services usually provide significantly better results with
 * better performance.
 * </p>
 *
 * <p>
 * Strings are processed one UTF-16 code unit at a time (via
 * <code>charAt()</code>), so each half of a surrogate pair is classified
 * separately.
 * </p>
 *
 * @class Text.WordBreak
 * @static
 */

var Text   = Y.Text,
    WBData = Text.Data.WordBreak,

// Constants representing code point classifications.
ALETTER      = 0,
MIDNUMLET    = 1,
MIDLETTER    = 2,
MIDNUM       = 3,
NUMERIC      = 4,
CR           = 5,
LF           = 6,
NEWLINE      = 7,
EXTEND       = 8,
FORMAT       = 9,
KATAKANA     = 10,
EXTENDNUMLET = 11,
OTHER        = 12,

// RegExp objects generated from code point data. Each regex matches a single
// character against a set of Unicode code points. The index of each item in
// this array must match its corresponding code point constant value defined
// above.
SETS = [
    new RegExp(WBData.aletter),
    new RegExp(WBData.midnumlet),
    new RegExp(WBData.midletter),
    new RegExp(WBData.midnum),
    new RegExp(WBData.numeric),
    new RegExp(WBData.cr),
    new RegExp(WBData.lf),
    new RegExp(WBData.newline),
    new RegExp(WBData.extend),
    new RegExp(WBData.format),
    new RegExp(WBData.katakana),
    new RegExp(WBData.extendnumlet)
],

EMPTY_STRING = '',
PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
WHITESPACE   = /\s/,

WordBreak = {
    // -- Public Static Methods ------------------------------------------------

    /**
     * Splits the specified string into an array of individual words.
     *
     * @method getWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options object containing zero or more
     *   of the following properties:
     *
     * <dl>
     *   <dt>ignoreCase (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the string will be converted to lowercase
     *     before being split. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includePunctuation (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include punctuation
     *     characters. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includeWhitespace (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include whitespace
     *     characters. Default is <code>false</code>.
     *   </dd>
     * </dl>
     * @return {Array} Array of words.
     * @static
     */
    getWords: function (string, options) {
        var i     = 0,
            word  = [],
            words = [],
            includePunctuation,
            includeWhitespace,
            len,
            map;

        if (!options) {
            options = {};
        }

        // Lowercase BEFORE classifying. toLowerCase() may change the length of
        // the string (for example U+0130 lowercases to "i" + U+0307), and the
        // classification map must line up index-for-index with the string the
        // characters are read from below.
        if (options.ignoreCase) {
            string = string.toLowerCase();
        }

        map = WordBreak._classify(string);
        len = map.length;

        includePunctuation = options.includePunctuation;
        includeWhitespace  = options.includeWhitespace;

        // Loop through each character in the classification map and determine
        // whether it precedes a word boundary, building an array of distinct
        // words as we go.
        for (; i < len; ++i) {
            // Append this character to the current word.
            word.push(string.charAt(i));

            // If there's a word boundary between the current character and the
            // next character, append the current word to the words array and
            // start building a new word.
            if (WordBreak._isWordBoundary(map, i)) {
                word = word.join(EMPTY_STRING);

                // Segments containing whitespace, or consisting solely of
                // punctuation, are dropped unless explicitly requested.
                if (word &&
                        (includeWhitespace  || !WHITESPACE.test(word)) &&
                        (includePunctuation || !PUNCTUATION.test(word))) {
                    words.push(word);
                }

                word = [];
            }
        }

        return words;
    },

    /**
     * Returns an array containing only unique words from the specified string.
     * For example, the string <code>'foo bar baz foo'</code> would result in
     * the array <code>['foo', 'bar', 'baz']</code>.
     *
     * @method getUniqueWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options (see <code>getWords()</code>
     *   for details).
     * @return {Array} Array of unique words.
     * @static
     */
    getUniqueWords: function (string, options) {
        return Y.Array.unique(WordBreak.getWords(string, options));
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>isWordBoundary('', 0)</code> and
     * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * @method isWordBoundary
     * @param {String} string String to test.
     * @param {Number} index Character index to test within the string.
     * @return {Boolean} <code>true</code> for a word boundary,
     *   <code>false</code> otherwise.
     * @static
     */
    isWordBoundary: function (string, index) {
        return WordBreak._isWordBoundary(WordBreak._classify(string), index);
    },

    // -- Protected Static Methods ---------------------------------------------

    /**
     * Returns a character classification map for the specified string. The map
     * contains one entry per character (as returned by <code>charAt()</code>);
     * each entry is the index of the first regex in <code>SETS</code> that
     * matches the character, or <code>OTHER</code> if none matches.
     *
     * @method _classify
     * @param {String} string String to classify.
     * @return {Array} Classification map.
     * @protected
     * @static
     */
    _classify: function (string) {
        var chr,
            map          = [],
            i            = 0,
            j,
            set,
            stringLength = string.length,
            setsLength   = SETS.length,
            type;

        for (; i < stringLength; ++i) {
            chr  = string.charAt(i);
            type = OTHER;

            // The first matching set wins, so the order of SETS matters.
            for (j = 0; j < setsLength; ++j) {
                set = SETS[j];

                if (set && set.test(chr)) {
                    type = j;
                    break;
                }
            }

            map.push(type);
        }

        return map;
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>_isWordBoundary('', 0)</code> and
     * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * <p>
     * An index that is negative, or that lies past the last entry of a
     * non-empty map, yields <code>false</code>.
     * </p>
     *
     * @method _isWordBoundary
     * @param {Array} map Character classification map generated by
     *   <code>_classify</code>.
     * @param {Number} index Character index to test.
     * @return {Boolean}
     * @protected
     * @static
     */
    _isWordBoundary: function (map, index) {
        var prevType,
            type     = map[index],
            nextType = map[index + 1],
            nextNextType;

        // Out-of-range indexes are never boundaries. Index 0 is let through
        // even for an empty map so that the end of an empty string counts as a
        // boundary.
        if (index < 0 || (index > map.length - 1 && index !== 0)) {
            return false;
        }

        // WB2. Always break at the end of the text. Without this check a
        // trailing Extend/Format character would fall into the WB4 shortcut
        // below, no boundary would be reported for the final character, and
        // getWords() would silently drop the last word.
        if (index >= map.length - 1) {
            return true;
        }

        // WB3. Don't break inside CRLF.
        if (type === CR && nextType === LF) {
            return false;
        }

        // WB3a. Break after newlines (including CR and LF). WB3, WB3a and WB3b
        // are evaluated before WB4, following the rule order of UAX #29, so a
        // newline next to an Extend/Format character is still a boundary.
        if (type === NEWLINE || type === CR || type === LF) {
            return true;
        }

        // WB3b. Break before newlines (including CR and LF).
        if (nextType === NEWLINE || nextType === CR || nextType === LF) {
            return true;
        }

        // WB5. Don't break between most letters.
        if (type === ALETTER && nextType === ALETTER) {
            return false;
        }

        nextNextType = map[index + 2];

        // WB6. Don't break letters across certain punctuation.
        if (type === ALETTER &&
                (nextType === MIDLETTER || nextType === MIDNUMLET) &&
                nextNextType === ALETTER) {
            return false;
        }

        prevType = map[index - 1];

        // WB7. Don't break letters across certain punctuation.
        if ((type === MIDLETTER || type === MIDNUMLET) &&
                nextType === ALETTER &&
                prevType === ALETTER) {
            return false;
        }

        // WB8/WB9/WB10. Don't break inside sequences of digits or digits
        // adjacent to letters.
        if ((type === NUMERIC || type === ALETTER) &&
                (nextType === NUMERIC || nextType === ALETTER)) {
            return false;
        }

        // WB11. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if ((type === MIDNUM || type === MIDNUMLET) &&
                nextType === NUMERIC &&
                prevType === NUMERIC) {
            return false;
        }

        // WB12. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if (type === NUMERIC &&
                (nextType === MIDNUM || nextType === MIDNUMLET) &&
                nextNextType === NUMERIC) {
            return false;
        }

        // WB4. Ignore format and extend characters. This is a simplification:
        // rather than skipping such characters when applying the other rules,
        // any position at or directly adjacent to one is treated as a
        // non-boundary.
        if (type === EXTEND || type === FORMAT ||
                prevType === EXTEND || prevType === FORMAT ||
                nextType === EXTEND || nextType === FORMAT) {
            return false;
        }

        // WB13. Don't break between Katakana characters.
        if (type === KATAKANA && nextType === KATAKANA) {
            return false;
        }

        // WB13a. Don't break from extenders.
        if (nextType === EXTENDNUMLET &&
                (type === ALETTER || type === NUMERIC || type === KATAKANA ||
                type === EXTENDNUMLET)) {
            return false;
        }

        // WB13b. Don't break from extenders.
        if (type === EXTENDNUMLET &&
                (nextType === ALETTER || nextType === NUMERIC ||
                nextType === KATAKANA)) {
            return false;
        }

        // Break after any character not covered by the rules above.
        return true;
    }
};

Text.WordBreak = WordBreak;


}, '3.17.2', {"requires": ["array-extras", "text-data-wordbreak"]});
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Aug 11 10:00:09 2016 | Cross-referenced by PHPXref 0.7.1 |