text_decoder_utils.js 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180
  1. //
  2. // Utilities
  3. //
  /**
   * Tests whether a number lies inside a closed interval.
   *
   * @param {number} a The number to test.
   * @param {number} min Lower bound of the range, inclusive.
   * @param {number} max Upper bound of the range, inclusive.
   * @return {boolean} True if min <= a and a <= max.
   */
  export function inRange(a, min, max) {
    const atOrAboveMin = a >= min
    return atOrAboveMin && a <= max
  }
  /** Shorthand for Math.floor (the very same function object). */
  export const { floor } = Math
  /**
   * Converts a string into an array of Unicode code points, following the
   * WebIDL "obtain Unicode" algorithm
   * (https://heycam.github.io/webidl/#dfn-obtain-unicode): a well-formed
   * surrogate pair becomes one supplementary code point, and every unpaired
   * surrogate becomes U+FFFD REPLACEMENT CHARACTER.
   *
   * @param {string} string Input string of UTF-16 code units.
   * @return {!Array.<number>} Code points.
   */
  export function stringToCodePoints(string) {
    const codePoints = []
    // String iteration advances by code point: a high surrogate directly
    // followed by a low surrogate is yielded as one two-unit chunk, while any
    // other code unit (including a lone surrogate) is yielded on its own.
    for (const chunk of String(string)) {
      const value = chunk.codePointAt(0)
      // A value still inside the surrogate block can only be an unpaired one.
      const isLoneSurrogate = value >= 0xD800 && value <= 0xDFFF
      codePoints.push(isLoneSurrogate ? 0xFFFD : value)
    }
    return codePoints
  }
  /**
   * Builds a string from a sequence of code points. Values up to 0xFFFF are
   * emitted as a single UTF-16 code unit; larger values are split into a
   * high/low surrogate pair.
   *
   * @param {!Array.<number>} code_points Array of code points.
   * @return {string} string String of UTF-16 code units.
   */
  export function codePointsToString(code_points) {
    const pieces = []
    for (let index = 0; index < code_points.length; index += 1) {
      const codePoint = code_points[index]
      if (codePoint <= 0xFFFF) {
        pieces.push(String.fromCharCode(codePoint))
        continue
      }
      // Supplementary plane: the 20-bit offset from 0x10000 is split into
      // its upper and lower 10 bits for the two surrogates.
      const offset = codePoint - 0x10000
      const highSurrogate = (offset >> 10) + 0xD800
      const lowSurrogate = (offset & 0x3FF) + 0xDC00
      pieces.push(String.fromCharCode(highSurrogate, lowSurrogate))
    }
    return pieces.join('')
  }
  /**
   * Signals a decoding error: throws in fatal mode, otherwise yields the code
   * point to substitute for the malformed input.
   *
   * @param {boolean} fatal If true, decoding errors raise an exception.
   * @param {number=} opt_code_point Override the standard fallback code point.
   *     Any number other than NaN is honored, including 0 (U+0000).
   * @return {number} The code point to insert on a decoding error:
   *     opt_code_point when a usable number was supplied, otherwise 0xFFFD.
   * @throws {TypeError} If fatal is truthy.
   */
  export function decoderError(fatal, opt_code_point) {
    if (fatal) {
      throw new TypeError('Decoder error')
    }
    // Check the type instead of truthiness so that an explicit override of 0
    // is not mistaken for "no override given".
    const hasOverride =
      typeof opt_code_point === 'number' && !Number.isNaN(opt_code_point)
    return hasOverride ? opt_code_point : 0xFFFD
  }
  /**
   * Reports a code point that the encoder cannot represent.
   *
   * @param {number} code_point The code point that could not be encoded.
   * @return {number} Always throws, no value is actually returned.
   * @throws {TypeError} On every call; the message includes code_point.
   */
  export function encoderError(code_point) {
    const message = 'The code point ' + code_point + ' could not be encoded.'
    throw new TypeError(message)
  }
  /**
   * Splits a code unit into two bytes in the requested byte order.
   *
   * @param {number} code_unit The code unit to split.
   * @param {boolean} utf16be When truthy the high byte comes first
   *     (big-endian); otherwise the low byte comes first.
   * @return {!Array.<number>} Two-element array holding the ordered bytes.
   */
  export function convertCodeUnitToBytes(code_unit, utf16be) {
    // High byte: code unit >> 8. Low byte: code unit & 0x00FF.
    const highByte = code_unit >> 8
    const lowByte = code_unit & 0x00FF
    return utf16be ? [highByte, lowByte] : [lowByte, highByte]
  }
  130. //
  131. // 4. Terminology
  132. //
  /**
   * An ASCII byte is a byte in the range 0x00 to 0x7F, inclusive.
   *
   * @param {number} a The number to test.
   * @return {boolean} True if 0x00 <= a and a <= 0x7F.
   */
  export function isASCIIByte(a) {
    const isNonNegative = a >= 0x00
    return isNonNegative && a <= 0x7F
  }
  /**
   * An ASCII code point is a code point in the range U+0000 to U+007F,
   * inclusive. The numeric test is the same as for ASCII bytes, so this is
   * the same function object as isASCIIByte under a second name.
   */
  const isASCIICodePoint = isASCIIByte
  export { isASCIICodePoint }
  /**
   * End-of-stream is a special token that signifies no more tokens are in
   * the stream.
   */
  const end_of_stream = -1
  /**
   * Sentinel that shares the numeric value of end_of_stream (-1).
   * NOTE(review): presumably the result a coder returns once it has nothing
   * further to emit; confirm against the modules that import it.
   */
  const finished = -1
  export { end_of_stream, finished }