[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
/*
YUI 3.17.2 (build 9c3c78e)
Copyright 2014 Yahoo! Inc. All rights reserved.
Licensed under the BSD License.
http://yuilibrary.com/license/
*/

YUI.add('text-wordbreak', function (Y, NAME) {

/**
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary.
 *
 * @module text
 * @submodule text-wordbreak
 */

/**
 * <p>
 * Provides utility methods for splitting strings on word breaks and determining
 * whether a character index represents a word boundary, using the generic word
 * breaking algorithm defined in the Unicode Text Segmentation guidelines
 * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
 * Annex #29</a>).
 * </p>
 *
 * <p>
 * This algorithm provides a reasonable default for many languages. However, it
 * does not cover language or context specific requirements, and it does not
 * provide meaningful results at all for languages that don't use spaces between
 * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
 * word breaking services usually provide significantly better results with
 * better performance.
 * </p>
 *
 * @class Text.WordBreak
 * @static
 */

var Text   = Y.Text,
    WBData = Text.Data.WordBreak,

// Constants representing code point classifications.
ALETTER      = 0,
MIDNUMLET    = 1,
MIDLETTER    = 2,
MIDNUM       = 3,
NUMERIC      = 4,
CR           = 5,
LF           = 6,
NEWLINE      = 7,
EXTEND       = 8,
FORMAT       = 9,
KATAKANA     = 10,
EXTENDNUMLET = 11,
OTHER        = 12,

// RegExp objects generated from code point data. Each regex matches a single
// character against a set of Unicode code points. The index of each item in
// this array must match its corresponding code point constant value defined
// above.
SETS = [
    new RegExp(WBData.aletter),
    new RegExp(WBData.midnumlet),
    new RegExp(WBData.midletter),
    new RegExp(WBData.midnum),
    new RegExp(WBData.numeric),
    new RegExp(WBData.cr),
    new RegExp(WBData.lf),
    new RegExp(WBData.newline),
    new RegExp(WBData.extend),
    new RegExp(WBData.format),
    new RegExp(WBData.katakana),
    new RegExp(WBData.extendnumlet)
],

EMPTY_STRING = '',
PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
WHITESPACE   = /\s/,

WordBreak = {
    // -- Public Static Methods ------------------------------------------------

    /**
     * Splits the specified string into an array of individual words.
     *
     * @method getWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options object containing zero or more
     *   of the following properties:
     *
     * <dl>
     *   <dt>ignoreCase (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the string will be converted to lowercase
     *     before being split. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includePunctuation (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include punctuation
     *     characters. Default is <code>false</code>.
     *   </dd>
     *
     *   <dt>includeWhitespace (Boolean)</dt>
     *   <dd>
     *     If <code>true</code>, the returned array will include whitespace
     *     characters. Default is <code>false</code>.
     *   </dd>
     * </dl>
     * @return {Array} Array of words.
     * @static
     */
    getWords: function (string, options) {
        var i     = 0,
            word  = [],
            words = [],
            chr,
            includePunctuation,
            includeWhitespace,
            len,
            map;

        if (!options) {
            options = {};
        }

        if (options.ignoreCase) {
            string = string.toLowerCase();
        }

        // Classify only after case conversion: lowercasing can change the
        // number of UTF-16 code units in the string, and the classification
        // map must line up index-for-index with the string that is actually
        // being split below.
        map = WordBreak._classify(string);
        len = map.length;

        includePunctuation = options.includePunctuation;
        includeWhitespace  = options.includeWhitespace;

        // Loop through each character in the classification map and determine
        // whether it precedes a word boundary, building an array of distinct
        // words as we go.
        for (; i < len; ++i) {
            chr = string.charAt(i);

            // Append this character to the current word.
            word.push(chr);

            // If there's a word boundary between the current character and the
            // next character, append the current word to the words array and
            // start building a new word.
            if (WordBreak._isWordBoundary(map, i)) {
                word = word.join(EMPTY_STRING);

                if (word &&
                        (includeWhitespace  || !WHITESPACE.test(word)) &&
                        (includePunctuation || !PUNCTUATION.test(word))) {
                    words.push(word);
                }

                word = [];
            }
        }

        return words;
    },

    /**
     * Returns an array containing only unique words from the specified string.
     * For example, the string <code>'foo bar baz foo'</code> would result in
     * the array <code>['foo', 'bar', 'baz']</code>.
     *
     * @method getUniqueWords
     * @param {String} string String to split.
     * @param {Object} options (optional) Options (see <code>getWords()</code>
     *   for details).
     * @return {Array} Array of unique words.
     * @static
     */
    getUniqueWords: function (string, options) {
        return Y.Array.unique(WordBreak.getWords(string, options));
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>isWordBoundary('', 0)</code> and
     * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * @method isWordBoundary
     * @param {String} string String to test.
     * @param {Number} index Character index to test within the string.
     * @return {Boolean} <code>true</code> for a word boundary,
     *   <code>false</code> otherwise.
     * @static
     */
    isWordBoundary: function (string, index) {
        return WordBreak._isWordBoundary(WordBreak._classify(string), index);
    },

    // -- Protected Static Methods ---------------------------------------------

    /**
     * Returns a character classification map for the specified string. The map
     * holds one entry per character (as returned by <code>charAt</code>): the
     * index of the first regex in <code>SETS</code> that matches the character,
     * or <code>OTHER</code> when none matches.
     *
     * @method _classify
     * @param {String} string String to classify.
     * @return {Array} Classification map.
     * @protected
     * @static
     */
    _classify: function (string) {
        var chr,
            map          = [],
            i            = 0,
            j,
            set,
            stringLength = string.length,
            setsLength   = SETS.length,
            type;

        for (; i < stringLength; ++i) {
            chr  = string.charAt(i);
            type = OTHER;

            // The first matching set wins; its array index doubles as the
            // classification constant.
            for (j = 0; j < setsLength; ++j) {
                set = SETS[j];

                if (set && set.test(chr)) {
                    type = j;
                    break;
                }
            }

            map.push(type);
        }

        return map;
    },

    /**
     * <p>
     * Returns <code>true</code> if there is a word boundary between the
     * specified character index and the next character index (or the end of the
     * string).
     * </p>
     *
     * <p>
     * Note that there are always word breaks at the beginning and end of a
     * string, so <code>_isWordBoundary('', 0)</code> and
     * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
     * </p>
     *
     * <p>
     * Rules are evaluated in the order given by UAX #29: the end-of-text and
     * newline rules take precedence over the rules that suppress breaks.
     * An out-of-bounds index logs a warning and returns <code>false</code>.
     * </p>
     *
     * @method _isWordBoundary
     * @param {Array} map Character classification map generated by
     *   <code>_classify</code>.
     * @param {Number} index Character index to test.
     * @return {Boolean}
     * @protected
     * @static
     */
    _isWordBoundary: function (map, index) {
        var prevType,
            type     = map[index],
            nextType = map[index + 1],
            nextNextType;

        if (index < 0 || (index > map.length - 1 && index !== 0)) {
            Y.log('isWordBoundary: index out of bounds', 'warn', 'text-wordbreak');
            return false;
        }

        // WB2. Always break at the end of the text. This also covers index 0
        // of an empty map. Without this check a trailing Extend/Format
        // character would suppress the final boundary and the last word
        // would never be emitted by getWords().
        if (index >= map.length - 1) {
            return true;
        }

        // WB3. Don't break inside CRLF.
        if (type === CR && nextType === LF) {
            return false;
        }

        // WB3a. Break after newlines (including CR and LF).
        if (type === NEWLINE || type === CR || type === LF) {
            return true;
        }

        // WB3b. Break before newlines (including CR and LF).
        if (nextType === NEWLINE || nextType === CR || nextType === LF) {
            return true;
        }

        prevType     = map[index - 1];
        nextNextType = map[index + 2];

        // WB4. Ignore format and extend characters. This is a coarse
        // approximation: rather than skipping these characters when looking
        // at neighbours, any boundary touching one of them is suppressed.
        if (type === EXTEND || type === FORMAT ||
                prevType === EXTEND || prevType === FORMAT ||
                nextType === EXTEND || nextType === FORMAT) {
            return false;
        }

        // WB5. Don't break between most letters.
        if (type === ALETTER && nextType === ALETTER) {
            return false;
        }

        // WB6. Don't break letters across certain punctuation.
        if (type === ALETTER &&
                (nextType === MIDLETTER || nextType === MIDNUMLET) &&
                nextNextType === ALETTER) {
            return false;
        }

        // WB7. Don't break letters across certain punctuation.
        if ((type === MIDLETTER || type === MIDNUMLET) &&
                nextType === ALETTER &&
                prevType === ALETTER) {
            return false;
        }

        // WB8/WB9/WB10. Don't break inside sequences of digits or digits
        // adjacent to letters.
        if ((type === NUMERIC || type === ALETTER) &&
                (nextType === NUMERIC || nextType === ALETTER)) {
            return false;
        }

        // WB11. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if ((type === MIDNUM || type === MIDNUMLET) &&
                nextType === NUMERIC &&
                prevType === NUMERIC) {
            return false;
        }

        // WB12. Don't break inside numeric sequences like "3.2" or
        // "3,456.789".
        if (type === NUMERIC &&
                (nextType === MIDNUM || nextType === MIDNUMLET) &&
                nextNextType === NUMERIC) {
            return false;
        }

        // WB13. Don't break between Katakana characters.
        if (type === KATAKANA && nextType === KATAKANA) {
            return false;
        }

        // WB13a. Don't break from extenders.
        if (nextType === EXTENDNUMLET &&
                (type === ALETTER || type === NUMERIC || type === KATAKANA ||
                type === EXTENDNUMLET)) {
            return false;
        }

        // WB13b. Don't break from extenders.
        if (type === EXTENDNUMLET &&
                (nextType === ALETTER || nextType === NUMERIC ||
                nextType === KATAKANA)) {
            return false;
        }

        // Break after any character not covered by the rules above.
        return true;
    }
};

Text.WordBreak = WordBreak;


}, '3.17.2', {"requires": ["array-extras", "text-data-wordbreak"]});
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Aug 11 10:00:09 2016 | Cross-referenced by PHPXref 0.7.1 |