PHPXRef 0.7.1 : Unnamed Project : /lib/yuilib/3.17.2/text-wordbreak/text-wordbreak.js source

[Summary view] [Print] [Text view]
   1  /*
   2  YUI 3.17.2 (build 9c3c78e)
   3  Copyright 2014 Yahoo! Inc. All rights reserved.
   4  Licensed under the BSD License.
   5  http://yuilibrary.com/license/
   6  */
   7  
   8  YUI.add('text-wordbreak', function (Y, NAME) {
   9  
  10  /**
  11   * Provides utility methods for splitting strings on word breaks and determining
  12   * whether a character index represents a word boundary.
  13   *
  14   * @module text
  15   * @submodule text-wordbreak
  16   */
  17  
  18  /**
  19   * <p>
  20   * Provides utility methods for splitting strings on word breaks and determining
  21   * whether a character index represents a word boundary, using the generic word
  22   * breaking algorithm defined in the Unicode Text Segmentation guidelines
  23   * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
  24   * Annex #29</a>).
  25   * </p>
  26   *
  27   * <p>
  28   * This algorithm provides a reasonable default for many languages. However, it
  29   * does not cover language or context specific requirements, and it does not
  30   * provide meaningful results at all for languages that don't use spaces between
  31   * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
  32   * word breaking services usually provide significantly better results with
  33   * better performance.
  34   * </p>
  35   *
  36   * @class Text.WordBreak
  37   * @static
  38   */
  39  
  40  var Text   = Y.Text,
  41      WBData = Text.Data.WordBreak,
  42  
  43  // Constants representing code point classifications.
  44  ALETTER      = 0,
  45  MIDNUMLET    = 1,
  46  MIDLETTER    = 2,
  47  MIDNUM       = 3,
  48  NUMERIC      = 4,
  49  CR           = 5,
  50  LF           = 6,
  51  NEWLINE      = 7,
  52  EXTEND       = 8,
  53  FORMAT       = 9,
  54  KATAKANA     = 10,
  55  EXTENDNUMLET = 11,
  56  OTHER        = 12,
  57  
  58  // RegExp objects generated from code point data. Each regex matches a single
  59  // character against a set of Unicode code points. The index of each item in
  60  // this array must match its corresponding code point constant value defined
  61  // above.
  62  SETS = [
  63      new RegExp(WBData.aletter),
  64      new RegExp(WBData.midnumlet),
  65      new RegExp(WBData.midletter),
  66      new RegExp(WBData.midnum),
  67      new RegExp(WBData.numeric),
  68      new RegExp(WBData.cr),
  69      new RegExp(WBData.lf),
  70      new RegExp(WBData.newline),
  71      new RegExp(WBData.extend),
  72      new RegExp(WBData.format),
  73      new RegExp(WBData.katakana),
  74      new RegExp(WBData.extendnumlet)
  75  ],
  76  
  77  EMPTY_STRING = '',
  78  PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
  79  WHITESPACE   = /\s/,
  80  
  81  WordBreak = {
  82      // -- Public Static Methods ------------------------------------------------
  83  
  84      /**
  85       * Splits the specified string into an array of individual words.
  86       *
  87       * @method getWords
  88       * @param {String} string String to split.
  89       * @param {Object} options (optional) Options object containing zero or more
  90       *   of the following properties:
  91       *
  92       * <dl>
  93       *   <dt>ignoreCase (Boolean)</dt>
  94       *   <dd>
  95       *     If <code>true</code>, the string will be converted to lowercase
  96       *     before being split. Default is <code>false</code>.
  97       *   </dd>
  98       *
  99       *   <dt>includePunctuation (Boolean)</dt>
 100       *   <dd>
 101       *     If <code>true</code>, the returned array will include punctuation
 102       *     characters. Default is <code>false</code>.
 103       *   </dd>
 104       *
 105       *   <dt>includeWhitespace (Boolean)</dt>
 106       *   <dd>
 107       *     If <code>true</code>, the returned array will include whitespace
 108       *     characters. Default is <code>false</code>.
 109       *   </dd>
 110       * </dl>
 111       * @return {Array} Array of words.
 112       * @static
 113       */
 114      getWords: function (string, options) {
 115          var i     = 0,
 116              map   = WordBreak._classify(string),
 117              len   = map.length,
 118              word  = [],
 119              words = [],
 120              chr,
 121              includePunctuation,
 122              includeWhitespace;
 123  
 124          if (!options) {
 125              options = {};
 126          }
 127  
 128          if (options.ignoreCase) {
 129              string = string.toLowerCase();
 130          }
 131  
 132          includePunctuation = options.includePunctuation;
 133          includeWhitespace  = options.includeWhitespace;
 134  
 135          // Loop through each character in the classification map and determine
 136          // whether it precedes a word boundary, building an array of distinct
 137          // words as we go.
 138          for (; i < len; ++i) {
 139              chr = string.charAt(i);
 140  
 141              // Append this character to the current word.
 142              word.push(chr);
 143  
 144              // If there's a word boundary between the current character and the
 145              // next character, append the current word to the words array and
 146              // start building a new word.
 147              if (WordBreak._isWordBoundary(map, i)) {
 148                  word = word.join(EMPTY_STRING);
 149  
 150                  if (word &&
 151                          (includeWhitespace  || !WHITESPACE.test(word)) &&
 152                          (includePunctuation || !PUNCTUATION.test(word))) {
 153                      words.push(word);
 154                  }
 155  
 156                  word = [];
 157              }
 158          }
 159  
 160          return words;
 161      },
 162  
 163      /**
 164       * Returns an array containing only unique words from the specified string.
 165       * For example, the string <code>'foo bar baz foo'</code> would result in
 166       * the array <code>['foo', 'bar', 'baz']</code>.
 167       *
 168       * @method getUniqueWords
 169       * @param {String} string String to split.
 170       * @param {Object} options (optional) Options (see <code>getWords()</code>
 171       *   for details).
 172       * @return {Array} Array of unique words.
 173       * @static
 174       */
 175      getUniqueWords: function (string, options) {
 176          return Y.Array.unique(WordBreak.getWords(string, options));
 177      },
 178  
 179      /**
 180       * <p>
 181       * Returns <code>true</code> if there is a word boundary between the
 182       * specified character index and the next character index (or the end of the
 183       * string).
 184       * </p>
 185       *
 186       * <p>
 187       * Note that there are always word breaks at the beginning and end of a
 188       * string, so <code>isWordBoundary('', 0)</code> and
 189       * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
 190       * </p>
 191       *
 192       * @method isWordBoundary
 193       * @param {String} string String to test.
 194       * @param {Number} index Character index to test within the string.
 195       * @return {Boolean} <code>true</code> for a word boundary,
 196       *   <code>false</code> otherwise.
 197       * @static
 198       */
 199      isWordBoundary: function (string, index) {
 200          return WordBreak._isWordBoundary(WordBreak._classify(string), index);
 201      },
 202  
 203      // -- Protected Static Methods ---------------------------------------------
 204  
 205      /**
 206       * Returns a character classification map for the specified string.
 207       *
 208       * @method _classify
 209       * @param {String} string String to classify.
 210       * @return {Array} Classification map.
 211       * @protected
 212       * @static
 213       */
 214      _classify: function (string) {
 215          var chr,
 216              map          = [],
 217              i            = 0,
 218              j,
 219              set,
 220              stringLength = string.length,
 221              setsLength   = SETS.length,
 222              type;
 223  
 224          for (; i < stringLength; ++i) {
 225              chr  = string.charAt(i);
 226              type = OTHER;
 227  
 228              for (j = 0; j < setsLength; ++j) {
 229                  set = SETS[j];
 230  
 231                  if (set && set.test(chr)) {
 232                      type = j;
 233                      break;
 234                  }
 235              }
 236  
 237              map.push(type);
 238          }
 239  
 240          return map;
 241      },
 242  
 243      /**
 244       * <p>
 245       * Returns <code>true</code> if there is a word boundary between the
 246       * specified character index and the next character index (or the end of the
 247       * string).
 248       * </p>
 249       *
 250       * <p>
 251       * Note that there are always word breaks at the beginning and end of a
 252       * string, so <code>_isWordBoundary('', 0)</code> and
 253       * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
 254       * </p>
 255       *
 256       * @method _isWordBoundary
 257       * @param {Array} map Character classification map generated by
 258       *   <code>_classify</code>.
 259       * @param {Number} index Character index to test.
 260       * @return {Boolean}
 261       * @protected
 262       * @static
 263       */
 264      _isWordBoundary: function (map, index) {
 265          var prevType,
 266              type     = map[index],
 267              nextType = map[index + 1],
 268              nextNextType;
 269  
 270          if (index < 0 || (index > map.length - 1 && index !== 0)) {
 271              return false;
 272          }
 273  
 274          // WB5. Don't break between most letters.
 275          if (type === ALETTER && nextType === ALETTER) {
 276              return false;
 277          }
 278  
 279          nextNextType = map[index + 2];
 280  
 281          // WB6. Don't break letters across certain punctuation.
 282          if (type === ALETTER &&
 283                  (nextType === MIDLETTER || nextType === MIDNUMLET) &&
 284                  nextNextType === ALETTER) {
 285              return false;
 286          }
 287  
 288          prevType = map[index - 1];
 289  
 290          // WB7. Don't break letters across certain punctuation.
 291          if ((type === MIDLETTER || type === MIDNUMLET) &&
 292                  nextType === ALETTER &&
 293                  prevType === ALETTER) {
 294              return false;
 295          }
 296  
 297          // WB8/WB9/WB10. Don't break inside sequences of digits or digits
 298          // adjacent to letters.
 299          if ((type === NUMERIC || type === ALETTER) &&
 300                  (nextType === NUMERIC || nextType === ALETTER)) {
 301              return false;
 302          }
 303  
 304          // WB11. Don't break inside numeric sequences like "3.2" or
 305          // "3,456.789".
 306          if ((type === MIDNUM || type === MIDNUMLET) &&
 307                  nextType === NUMERIC &&
 308                  prevType === NUMERIC) {
 309              return false;
 310          }
 311  
 312          // WB12. Don't break inside numeric sequences like "3.2" or
 313          // "3,456.789".
 314          if (type === NUMERIC &&
 315                  (nextType === MIDNUM || nextType === MIDNUMLET) &&
 316                  nextNextType === NUMERIC) {
 317              return false;
 318          }
 319  
 320          // WB4. Ignore format and extend characters.
 321          if (type === EXTEND || type === FORMAT ||
 322                  prevType === EXTEND || prevType === FORMAT ||
 323                  nextType === EXTEND || nextType === FORMAT) {
 324              return false;
 325          }
 326  
 327          // WB3. Don't break inside CRLF.
 328          if (type === CR && nextType === LF) {
 329              return false;
 330          }
 331  
 332          // WB3a. Break before newlines (including CR and LF).
 333          if (type === NEWLINE || type === CR || type === LF) {
 334              return true;
 335          }
 336  
 337          // WB3b. Break after newlines (including CR and LF).
 338          if (nextType === NEWLINE || nextType === CR || nextType === LF) {
 339              return true;
 340          }
 341  
 342          // WB13. Don't break between Katakana characters.
 343          if (type === KATAKANA && nextType === KATAKANA) {
 344              return false;
 345          }
 346  
 347          // WB13a. Don't break from extenders.
 348          if (nextType === EXTENDNUMLET &&
 349                  (type === ALETTER || type === NUMERIC || type === KATAKANA ||
 350                  type === EXTENDNUMLET)) {
 351              return false;
 352          }
 353  
 354          // WB13b. Don't break from extenders.
 355          if (type === EXTENDNUMLET &&
 356                  (nextType === ALETTER || nextType === NUMERIC ||
 357                  nextType === KATAKANA)) {
 358              return false;
 359          }
 360  
 361          // Break after any character not covered by the rules above.
 362          return true;
 363      }
 364  };
 365  
  366  Text.WordBreak = WordBreak; // Expose the utility object on the Y.Text namespace (Text is Y.Text).
 367  
 368  
 369  }, '3.17.2', {"requires": ["array-extras", "text-data-wordbreak"]});
PHP Cross Reference of Unnamed Project

/lib/yuilib/3.17.2/text-wordbreak/ -> text-wordbreak.js (source)