PHPXRef 0.7.1 : Unnamed Project : /lib/yuilib/3.17.2/text-wordbreak/text-wordbreak-debug.js source

[Summary view] [Print] [Text view]
   1  /*
   2  YUI 3.17.2 (build 9c3c78e)
   3  Copyright 2014 Yahoo! Inc. All rights reserved.
   4  Licensed under the BSD License.
   5  http://yuilibrary.com/license/
   6  */
   7  
   8  YUI.add('text-wordbreak', function (Y, NAME) {
   9  
  10  /**
  11   * Provides utility methods for splitting strings on word breaks and determining
  12   * whether a character index represents a word boundary.
  13   *
  14   * @module text
  15   * @submodule text-wordbreak
  16   */
  17  
  18  /**
  19   * <p>
  20   * Provides utility methods for splitting strings on word breaks and determining
  21   * whether a character index represents a word boundary, using the generic word
  22   * breaking algorithm defined in the Unicode Text Segmentation guidelines
  23   * (<a href="http://unicode.org/reports/tr29/#Word_Boundaries">Unicode Standard
  24   * Annex #29</a>).
  25   * </p>
  26   *
  27   * <p>
  28   * This algorithm provides a reasonable default for many languages. However, it
  29   * does not cover language or context specific requirements, and it does not
  30   * provide meaningful results at all for languages that don't use spaces between
  31   * words, such as Chinese, Japanese, Thai, Lao, Khmer, and others. Server-based
  32   * word breaking services usually provide significantly better results with
  33   * better performance.
  34   * </p>
  35   *
  36   * @class Text.WordBreak
  37   * @static
  38   */
  39  
  40  var Text   = Y.Text,
  41      WBData = Text.Data.WordBreak,
  42  
  43  // Constants representing code point classifications.
  44  ALETTER      = 0,
  45  MIDNUMLET    = 1,
  46  MIDLETTER    = 2,
  47  MIDNUM       = 3,
  48  NUMERIC      = 4,
  49  CR           = 5,
  50  LF           = 6,
  51  NEWLINE      = 7,
  52  EXTEND       = 8,
  53  FORMAT       = 9,
  54  KATAKANA     = 10,
  55  EXTENDNUMLET = 11,
  56  OTHER        = 12,
  57  
  58  // RegExp objects generated from code point data. Each regex matches a single
  59  // character against a set of Unicode code points. The index of each item in
  60  // this array must match its corresponding code point constant value defined
  61  // above.
  62  SETS = [
  63      new RegExp(WBData.aletter),
  64      new RegExp(WBData.midnumlet),
  65      new RegExp(WBData.midletter),
  66      new RegExp(WBData.midnum),
  67      new RegExp(WBData.numeric),
  68      new RegExp(WBData.cr),
  69      new RegExp(WBData.lf),
  70      new RegExp(WBData.newline),
  71      new RegExp(WBData.extend),
  72      new RegExp(WBData.format),
  73      new RegExp(WBData.katakana),
  74      new RegExp(WBData.extendnumlet)
  75  ],
  76  
  77  EMPTY_STRING = '',
  78  PUNCTUATION  = new RegExp('^' + WBData.punctuation + '$'),
  79  WHITESPACE   = /\s/,
  80  
  81  WordBreak = {
  82      // -- Public Static Methods ------------------------------------------------
  83  
  84      /**
  85       * Splits the specified string into an array of individual words.
  86       *
  87       * @method getWords
  88       * @param {String} string String to split.
  89       * @param {Object} options (optional) Options object containing zero or more
  90       *   of the following properties:
  91       *
  92       * <dl>
  93       *   <dt>ignoreCase (Boolean)</dt>
  94       *   <dd>
  95       *     If <code>true</code>, the string will be converted to lowercase
  96       *     before being split. Default is <code>false</code>.
  97       *   </dd>
  98       *
  99       *   <dt>includePunctuation (Boolean)</dt>
 100       *   <dd>
 101       *     If <code>true</code>, the returned array will include punctuation
 102       *     characters. Default is <code>false</code>.
 103       *   </dd>
 104       *
 105       *   <dt>includeWhitespace (Boolean)</dt>
 106       *   <dd>
 107       *     If <code>true</code>, the returned array will include whitespace
 108       *     characters. Default is <code>false</code>.
 109       *   </dd>
 110       * </dl>
 111       * @return {Array} Array of words.
 112       * @static
 113       */
 114      getWords: function (string, options) {
 115          var i     = 0,
 116              map   = WordBreak._classify(string),
 117              len   = map.length,
 118              word  = [],
 119              words = [],
 120              chr,
 121              includePunctuation,
 122              includeWhitespace;
 123  
 124          if (!options) {
 125              options = {};
 126          }
 127  
 128          if (options.ignoreCase) {
 129              string = string.toLowerCase();
 130          }
 131  
 132          includePunctuation = options.includePunctuation;
 133          includeWhitespace  = options.includeWhitespace;
 134  
 135          // Loop through each character in the classification map and determine
 136          // whether it precedes a word boundary, building an array of distinct
 137          // words as we go.
 138          for (; i < len; ++i) {
 139              chr = string.charAt(i);
 140  
 141              // Append this character to the current word.
 142              word.push(chr);
 143  
 144              // If there's a word boundary between the current character and the
 145              // next character, append the current word to the words array and
 146              // start building a new word.
 147              if (WordBreak._isWordBoundary(map, i)) {
 148                  word = word.join(EMPTY_STRING);
 149  
 150                  if (word &&
 151                          (includeWhitespace  || !WHITESPACE.test(word)) &&
 152                          (includePunctuation || !PUNCTUATION.test(word))) {
 153                      words.push(word);
 154                  }
 155  
 156                  word = [];
 157              }
 158          }
 159  
 160          return words;
 161      },
 162  
 163      /**
 164       * Returns an array containing only unique words from the specified string.
 165       * For example, the string <code>'foo bar baz foo'</code> would result in
 166       * the array <code>['foo', 'bar', 'baz']</code>.
 167       *
 168       * @method getUniqueWords
 169       * @param {String} string String to split.
 170       * @param {Object} options (optional) Options (see <code>getWords()</code>
 171       *   for details).
 172       * @return {Array} Array of unique words.
 173       * @static
 174       */
 175      getUniqueWords: function (string, options) {
 176          return Y.Array.unique(WordBreak.getWords(string, options));
 177      },
 178  
 179      /**
 180       * <p>
 181       * Returns <code>true</code> if there is a word boundary between the
 182       * specified character index and the next character index (or the end of the
 183       * string).
 184       * </p>
 185       *
 186       * <p>
 187       * Note that there are always word breaks at the beginning and end of a
 188       * string, so <code>isWordBoundary('', 0)</code> and
 189       * <code>isWordBoundary('a', 0)</code> will both return <code>true</code>.
 190       * </p>
 191       *
 192       * @method isWordBoundary
 193       * @param {String} string String to test.
 194       * @param {Number} index Character index to test within the string.
 195       * @return {Boolean} <code>true</code> for a word boundary,
 196       *   <code>false</code> otherwise.
 197       * @static
 198       */
 199      isWordBoundary: function (string, index) {
 200          return WordBreak._isWordBoundary(WordBreak._classify(string), index);
 201      },
 202  
 203      // -- Protected Static Methods ---------------------------------------------
 204  
 205      /**
 206       * Returns a character classification map for the specified string.
 207       *
 208       * @method _classify
 209       * @param {String} string String to classify.
 210       * @return {Array} Classification map.
 211       * @protected
 212       * @static
 213       */
 214      _classify: function (string) {
 215          var chr,
 216              map          = [],
 217              i            = 0,
 218              j,
 219              set,
 220              stringLength = string.length,
 221              setsLength   = SETS.length,
 222              type;
 223  
 224          for (; i < stringLength; ++i) {
 225              chr  = string.charAt(i);
 226              type = OTHER;
 227  
 228              for (j = 0; j < setsLength; ++j) {
 229                  set = SETS[j];
 230  
 231                  if (set && set.test(chr)) {
 232                      type = j;
 233                      break;
 234                  }
 235              }
 236  
 237              map.push(type);
 238          }
 239  
 240          return map;
 241      },
 242  
 243      /**
 244       * <p>
 245       * Returns <code>true</code> if there is a word boundary between the
 246       * specified character index and the next character index (or the end of the
 247       * string).
 248       * </p>
 249       *
 250       * <p>
 251       * Note that there are always word breaks at the beginning and end of a
 252       * string, so <code>_isWordBoundary('', 0)</code> and
 253       * <code>_isWordBoundary('a', 0)</code> will both return <code>true</code>.
 254       * </p>
 255       *
 256       * @method _isWordBoundary
 257       * @param {Array} map Character classification map generated by
 258       *   <code>_classify</code>.
 259       * @param {Number} index Character index to test.
 260       * @return {Boolean}
 261       * @protected
 262       * @static
 263       */
 264      _isWordBoundary: function (map, index) {
 265          var prevType,
 266              type     = map[index],
 267              nextType = map[index + 1],
 268              nextNextType;
 269  
 270          if (index < 0 || (index > map.length - 1 && index !== 0)) {
 271              Y.log('isWordBoundary: index out of bounds', 'warn', 'text-wordbreak');
 272              return false;
 273          }
 274  
 275          // WB5. Don't break between most letters.
 276          if (type === ALETTER && nextType === ALETTER) {
 277              return false;
 278          }
 279  
 280          nextNextType = map[index + 2];
 281  
 282          // WB6. Don't break letters across certain punctuation.
 283          if (type === ALETTER &&
 284                  (nextType === MIDLETTER || nextType === MIDNUMLET) &&
 285                  nextNextType === ALETTER) {
 286              return false;
 287          }
 288  
 289          prevType = map[index - 1];
 290  
 291          // WB7. Don't break letters across certain punctuation.
 292          if ((type === MIDLETTER || type === MIDNUMLET) &&
 293                  nextType === ALETTER &&
 294                  prevType === ALETTER) {
 295              return false;
 296          }
 297  
 298          // WB8/WB9/WB10. Don't break inside sequences of digits or digits
 299          // adjacent to letters.
 300          if ((type === NUMERIC || type === ALETTER) &&
 301                  (nextType === NUMERIC || nextType === ALETTER)) {
 302              return false;
 303          }
 304  
 305          // WB11. Don't break inside numeric sequences like "3.2" or
 306          // "3,456.789".
 307          if ((type === MIDNUM || type === MIDNUMLET) &&
 308                  nextType === NUMERIC &&
 309                  prevType === NUMERIC) {
 310              return false;
 311          }
 312  
 313          // WB12. Don't break inside numeric sequences like "3.2" or
 314          // "3,456.789".
 315          if (type === NUMERIC &&
 316                  (nextType === MIDNUM || nextType === MIDNUMLET) &&
 317                  nextNextType === NUMERIC) {
 318              return false;
 319          }
 320  
 321          // WB4. Ignore format and extend characters.
 322          if (type === EXTEND || type === FORMAT ||
 323                  prevType === EXTEND || prevType === FORMAT ||
 324                  nextType === EXTEND || nextType === FORMAT) {
 325              return false;
 326          }
 327  
 328          // WB3. Don't break inside CRLF.
 329          if (type === CR && nextType === LF) {
 330              return false;
 331          }
 332  
 333          // WB3a. Break before newlines (including CR and LF).
 334          if (type === NEWLINE || type === CR || type === LF) {
 335              return true;
 336          }
 337  
 338          // WB3b. Break after newlines (including CR and LF).
 339          if (nextType === NEWLINE || nextType === CR || nextType === LF) {
 340              return true;
 341          }
 342  
 343          // WB13. Don't break between Katakana characters.
 344          if (type === KATAKANA && nextType === KATAKANA) {
 345              return false;
 346          }
 347  
 348          // WB13a. Don't break from extenders.
 349          if (nextType === EXTENDNUMLET &&
 350                  (type === ALETTER || type === NUMERIC || type === KATAKANA ||
 351                  type === EXTENDNUMLET)) {
 352              return false;
 353          }
 354  
 355          // WB13b. Don't break from extenders.
 356          if (type === EXTENDNUMLET &&
 357                  (nextType === ALETTER || nextType === NUMERIC ||
 358                  nextType === KATAKANA)) {
 359              return false;
 360          }
 361  
 362          // Break after any character not covered by the rules above.
 363          return true;
 364      }
 365  };
 366  
// Publish the utility object on the Y.Text namespace as Y.Text.WordBreak.
Text.WordBreak = WordBreak;
 368  
 369  
 370  }, '3.17.2', {"requires": ["array-extras", "text-data-wordbreak"]});
PHP Cross Reference of Unnamed Project

/lib/yuilib/3.17.2/text-wordbreak/ -> text-wordbreak-debug.js (source)