| Class | ActiveSupport::Multibyte::Handlers::UTF8Handler |
| In: |
vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb
|
| Parent: | Object |
UTF8Handler implements Unicode-aware operations for strings. These operations are used by the Chars proxy when $KCODE is set to 'UTF8'.
| HANGUL_SBASE | = | 0xAC00 | Hangul character boundaries and properties | |
| HANGUL_LBASE | = | 0x1100 | ||
| HANGUL_VBASE | = | 0x1161 | ||
| HANGUL_TBASE | = | 0x11A7 | ||
| HANGUL_LCOUNT | = | 19 | ||
| HANGUL_VCOUNT | = | 21 | ||
| HANGUL_TCOUNT | = | 28 | ||
| HANGUL_NCOUNT | = | HANGUL_VCOUNT * HANGUL_TCOUNT | ||
| HANGUL_SCOUNT | = | 11172 | ||
| HANGUL_SLAST | = | HANGUL_SBASE + HANGUL_SCOUNT | ||
| HANGUL_JAMO_FIRST | = | 0x1100 | ||
| HANGUL_JAMO_LAST | = | 0x11FF | ||
| UNICODE_WHITESPACE | = | [ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D> 0x0020, # White_Space # Zs SPACE 0x0085, # White_Space # Cc <control-0085> 0x00A0, # White_Space # Zs NO-BREAK SPACE 0x1680, # White_Space # Zs OGHAM SPACE MARK 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE 0x2028, # White_Space # Zl LINE SEPARATOR 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE ].flatten.freeze | All the unicode whitespace | |
| UNICODE_LEADERS_AND_TRAILERS | = | UNICODE_WHITESPACE + [65279] | The BOM (byte order mark) can also be seen as whitespace: it's a non-rendering character used to distinguish between little-endian and big-endian byte order. This is not an issue in UTF-8, so it must be ignored. | |
| UTF8_PAT | = | /\A(?: [\x00-\x7f] | [\xc2-\xdf] [\x80-\xbf] | \xe0 [\xa0-\xbf] [\x80-\xbf] | [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] )*\z/xn | Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) | |
| UNICODE_TRAILERS_PAT | = | /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ | ||
| UNICODE_LEADERS_PAT | = | /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ | ||
| UCD | = | UnicodeDatabase.new | Unicode database |
| size | -> | length |
| slice | -> | [] |
Works just like the indexed replace method on string, except instead of byte offsets you specify character offsets.
Example:
s = "Müller" s.chars[2] = "e" # Replace character with offset 2 s #=> "Müeler" s = "Müller" s.chars[1, 2] = "ö" # Replace 2 characters at character offset 1 s #=> "Möler"
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 158
# Works like the indexed replace method on String, except that offsets are
# given in characters (codepoints) instead of bytes. The string is modified
# in place via String#replace.
#
# The last argument is the replacement: a string, or an Integer codepoint
# when the target is an integer offset. The preceding arguments select the
# target:
# * a Regexp (handed straight to String#[]=),
# * an integer character offset, optionally followed by a length,
# * a Range of character offsets,
# * anything else is converted with to_s and treated as a substring to replace.
#
# Raises IndexError when an integer offset lies at or past the end of the
# string, or when the substring cannot be found; raises RangeError when a
# range starts at or past the end of the string.
def []=(str, *args)
  replace_by = args.pop
  # Indexed replace with regular expressions already works
  return str[*args] = replace_by if args.first.is_a?(Regexp)
  result = u_unpack(str)
  target = args.first
  if target.is_a?(Integer)
    raise IndexError, "index #{target} out of string" if target >= result.length
    # An Integer replacement is a single codepoint; turn it into a string first.
    replace_by = [replace_by].pack('U') if replace_by.is_a?(Integer)
    # Use the start/length form: a start..end range built from a negative
    # start (e.g. -1 with length 2 => -1..0) would not select the tail.
    result[target, args[1] || 1] = u_unpack(replace_by)
  elsif target.is_a?(Range)
    # Range#min is nil for ranges such as 1..-1, so compare the start instead.
    raise RangeError, "#{target} out of range" if target.begin >= result.length
    result[target] = u_unpack(replace_by)
  else
    needle = target.to_s
    min = index(str, needle)
    # Mirror String#[]= instead of failing later with nil arithmetic.
    raise IndexError, "string not matched" if min.nil?
    result[min, length(needle)] = u_unpack(replace_by)
  end
  str.replace(result.pack('U*'))
end
Returns a copy of str with the first character converted to uppercase and the remainder to lowercase
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 276
# Returns a copy of str with its first character upcased and every
# following character downcased.
def capitalize(str)
  head = upcase(slice(str, 0..0))
  # slice returns nil when there is nothing after the first character.
  tail = slice(str, 1..-1) || ''
  head + downcase(tail)
end
Works just like String#center, only integer specifies characters instead of bytes.
Example:
"¾ cup".chars.center(8).to_s
#=> " ¾ cup  "
"¾ cup".chars.center(8, " ").to_s # Use non-breaking whitespace
#=> " ¾ cup  "
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 217
# Centers str within a field of +integer+ characters, filling both sides
# with padstr. Delegates to justify with the :center mode.
def center(str, integer, padstr=' ')
  justify(str, integer, :center, padstr)
end
Perform composition on the characters in the string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 312
# Perform composition on the characters in the string.
#
# The string is unpacked into codepoints, the codepoints are composed, and
# the result is packed back into a string. The parentheses matter: without
# them the packed *string* is handed to compose_codepoints, which expects an
# array of codepoints.
def compose(str)
  compose_codepoints(u_unpack(str)).pack('U*')
end
Checks if the string is valid UTF8.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 342
# Reports whether str can be unpacked as UTF-8 codepoints: true when
# String#unpack('U*') succeeds, false when it raises ArgumentError.
def consumes?(str)
  # Unpack is a little bit faster than regular expressions
  str.unpack('U*')
  true
rescue ArgumentError
  false
end
Perform decomposition on the characters in the string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 307
# Perform canonical decomposition on the characters in the string and
# return the result packed back into a string.
def decompose(str)
  codepoints = u_unpack(str)
  decompose_codepoints(:canonical, codepoints).pack('U*')
end
Convert characters in the string to lowercase
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 273
# Convert characters in the string to lowercase using the
# :lowercase_mapping case mapping.
def downcase(str)
  to_case(:lowercase_mapping, str)
end
Returns the number of grapheme clusters in the string. This method is very likely to be moved or renamed in future versions.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 354
# Returns the number of grapheme clusters in the string, i.e. the number
# of groups produced by g_unpack.
def g_length(str)
  clusters = g_unpack(str)
  clusters.length
end
Returns the position of the passed argument in the string, counting in codepoints
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 139
# Returns the position of the passed argument in the string, counted in
# codepoints, or nil when String#index finds nothing.
#
# NOTE(review): any start offset in args is passed to String#index
# unchanged, so it is presumably interpreted in bytes - confirm.
def index(str, *args)
  bidx = str.index(*args)
  return nil unless bidx
  # Count the codepoints that precede the match.
  u_unpack(str.slice(0...bidx)).size
end
Inserts the passed string at specified codepoint offsets
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 129
# Inserts fragment into str at the given codepoint offset. str is
# modified in place (via String#replace) and returned.
def insert(str, offset, fragment)
  codepoints = u_unpack(str)
  # The fragment goes in as a nested array and is flattened afterwards.
  codepoints.insert(offset, u_unpack(fragment))
  str.replace(codepoints.flatten.pack('U*'))
end
Works just like String#ljust, only integer specifies characters instead of bytes.
Example:
"¾ cup".chars.ljust(8).to_s
#=> "¾ cup   "
"¾ cup".chars.ljust(8, " ").to_s # Use non-breaking whitespace
#=> "¾ cup   "
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 204
# Left-justifies str within a field of +integer+ characters, filling the
# right side with padstr. Delegates to justify with the :left mode.
def ljust(str, integer, padstr=' ')
  justify(str, integer, :left, padstr)
end
Returns the KC normalization of the string by default. NFKC is considered the best normalization form for passing strings to databases and validations.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 289
# Returns the normalization of str in the requested form (:d, :c, :kd or
# :kc); the default comes from DEFAULT_NORMALIZATION_FORM.
#
# Every form decomposes and reorders the codepoints; :c and :kc compose
# them again afterwards. Raises ArgumentError for any other form.
def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM)
  # See http://www.unicode.org/reports/tr15, Table 1
  codepoints = u_unpack(str)
  unless [:d, :c, :kd, :kc].include?(form)
    raise ArgumentError, "#{form} is not a valid normalization variant", caller
  end
  # :d/:c use canonical decomposition, :kd/:kc the compatibility one.
  decomposition = [:d, :c].include?(form) ? :canonical : :compatability
  normalized = reorder_characters(decompose_codepoints(decomposition, codepoints))
  normalized = compose_codepoints(normalized) if [:c, :kc].include?(form)
  normalized.pack('U*')
end
Reverses codepoints in the string.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 243
# Returns a new string with the codepoints of str in reverse order.
def reverse(str)
  codepoints = u_unpack(str)
  codepoints.reverse.pack('U*')
end
Works just like String#rjust, only integer specifies characters instead of bytes.
Example:
"¾ cup".chars.rjust(8).to_s
#=> "   ¾ cup"
"¾ cup".chars.rjust(8, " ").to_s # Use non-breaking whitespace
#=> "   ¾ cup"
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 191
# Right-justifies str within a field of +integer+ characters, filling the
# left side with padstr. Delegates to justify with the :right mode.
def rjust(str, integer, padstr=' ')
  justify(str, integer, :right, padstr)
end
Returns the number of codepoints in the string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 237
# Returns the number of codepoints in the string.
def size(str)
  codepoints = u_unpack(str)
  codepoints.size
end
Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that character.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 249
# Implements Unicode-aware slice with codepoints.
#
# Accepted argument forms:
# * a single Numeric offset - returns the codepoint (an Integer) at that
#   character position,
# * a Numeric offset and a Numeric length, or a Range - returns the
#   selected characters as a string,
# * a Regexp (optionally with a capture index) or a String - delegated to
#   String#slice.
#
# Returns nil when the requested offset or range lies outside the string.
# Raises ArgumentError for more than two arguments and TypeError for
# two-argument calls whose arguments are not of the types listed above.
def slice(str, *args)
  if args.size > 2
    raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
  elsif (args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp)))
    raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
  elsif (args.size == 2 && !args[1].is_a?(Numeric))
    raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
  elsif args[0].kind_of?(Regexp) || args[0].kind_of?(String)
    # Pattern and substring lookups already work on the raw string.
    str.slice(*args)
  elsif args.size == 1 && args[0].kind_of?(Numeric)
    u_unpack(str)[args[0]]
  else
    # Array#slice returns nil for out-of-range requests; pass that through
    # instead of calling pack on nil.
    cps = u_unpack(str).slice(*args)
    cps.nil? ? nil : cps.pack('U*')
  end
end
Removes leading and trailing whitespace.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 232
# Returns a copy of str with the text matched by UNICODE_LEADERS_PAT and
# UNICODE_TRAILERS_PAT removed, in that order.
def strip(str)
  without_leaders = str.gsub(UNICODE_LEADERS_PAT, '')
  without_leaders.gsub(UNICODE_TRAILERS_PAT, '')
end
Replaces all the non-utf-8 bytes by their iso-8859-1 or cp1252 equivalent resulting in a valid utf-8 string
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 359
# Replaces every piece of str that does not match UTF8_PAT with a UTF-8
# encoding of its first byte, read as cp1252 / iso-8859-1, and returns the
# joined result.
def tidy_bytes(str)
  tidied = str.split(//u).map do |char|
    if UTF8_PAT.match(char)
      char
    else
      byte = char.unpack('C')[0]
      if byte < 128
        byte.chr
      elsif byte < 160
        # Use the cp1252 table entry when there is one, else the byte itself.
        [UCD.cp1252[byte] || byte].pack('U')
      elsif byte < 192
        "\xC2" + byte.chr
      else
        "\xC3" + (byte - 64).chr
      end
    end
  end
  tidied.join
end
Used to translate an offset from bytes to characters, for instance one received from a regular expression match
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 321
# Translates an offset from bytes to characters, for instance one received
# from a regular expression match.
#
# Returns nil for a nil offset and 0 for an empty string. Raises
# EncodingError when the leading chunk still cannot be unpacked after it
# has been extended to the end of the string.
def translate_offset(str, byte_offset)
  return nil if byte_offset.nil?
  return 0 if str == ''
  chunk = str[0..byte_offset]
  begin
    chunk.unpack('U*').length - 1
  rescue ArgumentError
    # We damaged a character: widen the chunk by one and try again.
    chunk = str[0..(byte_offset += 1)]
    # Stop retrying at the end of the string and raise our own error.
    raise EncodingError.new('malformed UTF-8 character') unless byte_offset < chunk.length
    retry
  end
end
Convert characters in the string to uppercase
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 270
# Convert characters in the string to uppercase using the
# :uppercase_mapping case mapping.
def upcase(str)
  to_case(:uppercase_mapping, str)
end
Compose decomposed characters to the composed form
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 503
# Compose decomposed characters to the composed form.
#
# Takes an array of codepoints, rewrites it in place (slice assignment and
# delete_at) and returns the same array.
#
# Bookkeeping used by the loop:
#   pos                      - index of the codepoint currently examined
#   eoa                      - index of the last element; decremented whenever
#                              codepoints are merged away
#   starter_pos/starter_char - position and value of the codepoint that later
#                              codepoints are combined into
#   previous_combining_class - combining class of the previously examined
#                              codepoint, reset to -1 after a composition
503: def compose_codepoints(codepoints)
504: pos = 0
505: eoa = codepoints.length - 1
506: starter_pos = 0
507: starter_char = codepoints[0]
508: previous_combining_class = -1
509: while pos < eoa
510: pos += 1
# Offset of the starter from HANGUL_LBASE; the next test checks whether it
# falls inside the HANGUL_LCOUNT-wide range.
511: lindex = starter_char - HANGUL_LBASE
512: # -- Hangul
513: if 0 <= lindex and lindex < HANGUL_LCOUNT
# The rescue modifier turns a failed subtraction (e.g. indexing past the end
# yields nil) into -1, which fails the range test below.
514: vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
515: if 0 <= vindex and vindex < HANGUL_VCOUNT
516: tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
517: if 0 <= tindex and tindex < HANGUL_TCOUNT
# Three codepoints collapse into one, so the array shrinks by two.
518: j = starter_pos + 2
519: eoa -= 2
520: else
# No usable third codepoint: two codepoints collapse into one.
521: tindex = 0
522: j = starter_pos + 1
523: eoa -= 1
524: end
# Replace the consumed codepoints with the single computed syllable.
525: codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
526: end
# Move the starter to the next position whether or not a merge happened.
527: starter_pos += 1
528: starter_char = codepoints[starter_pos]
529: # -- Other characters
530: else
531: current_char = codepoints[pos]
532: current = UCD[current_char]
# Only try to compose when the combining class is higher than the previous one.
533: if current.combining_class > previous_combining_class
# composition_map is looked up first by starter, then by the current codepoint.
534: if ref = UCD.composition_map[starter_char]
535: composition = ref[current_char]
536: else
537: composition = nil
538: end
539: unless composition.nil?
# Composition found: overwrite the starter, drop the current codepoint and
# step back so the element that moved into this slot is examined next.
540: codepoints[starter_pos] = composition
541: starter_char = composition
542: codepoints.delete_at pos
543: eoa -= 1
544: pos -= 1
545: previous_combining_class = -1
546: else
547: previous_combining_class = current.combining_class
548: end
549: else
550: previous_combining_class = current.combining_class
551: end
# A codepoint with combining class 0 becomes the new starter.
552: if current.combining_class == 0
553: starter_pos = pos
554: starter_char = codepoints[pos]
555: end
556: end
557: end
558: codepoints
559: end
Decompose composed characters to the decomposed form
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 482
# Decompose composed characters to the decomposed form.
#
# type selects the decomposition: a mapping that carries a decomp_type is
# only applied when type is :compatability; mappings without one are
# always applied. Returns a new array of codepoints.
def decompose_codepoints(type, codepoints)
  decomposed = []
  codepoints.each do |cp|
    if HANGUL_SBASE <= cp && cp < HANGUL_SLAST
      # Codepoint in the Hangul syllable range: compute its parts.
      sindex = cp - HANGUL_SBASE
      tindex = sindex % HANGUL_TCOUNT
      decomposed << (HANGUL_LBASE + sindex / HANGUL_NCOUNT)
      decomposed << (HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT)
      decomposed << (HANGUL_TBASE + tindex) unless tindex == 0
    elsif (mapping = UCD[cp].decomp_mapping) && (!UCD[cp].decomp_type || type == :compatability)
      # Decomposable with the current type: decompose the mapping recursively.
      decomposed.concat(decompose_codepoints(type, mapping.dup))
    else
      decomposed << cp
    end
  end
  decomposed
end
Unpack the string at grapheme boundaries instead of codepoint boundaries
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 390
# Unpack the string at grapheme boundaries instead of codepoint boundaries.
# Returns an array of codepoint arrays, one per cluster.
def g_unpack(str)
  codepoints = u_unpack(str)
  clusters = []
  cluster_start = 0
  1.upto(codepoints.length) do |pos|
    previous = codepoints[pos - 1]
    current = codepoints[pos]
    # No boundary between previous and current when any rule matches.
    joined =
      # CR X LF
      (previous == UCD.boundary[:cr] && current == UCD.boundary[:lf]) ||
      # L X (L|V|LV|LVT)
      (UCD.boundary[:l] === previous && in_char_class?(current, [:l, :v, :lv, :lvt])) ||
      # (LV|V) X (V|T)
      (in_char_class?(previous, [:lv, :v]) && in_char_class?(current, [:v, :t])) ||
      # (LVT|T) X (T)
      (in_char_class?(previous, [:lvt, :t]) && UCD.boundary[:t] === current) ||
      # X Extend
      (UCD.boundary[:extend] === current)
    next if joined
    clusters << codepoints[cluster_start..pos - 1]
    cluster_start = pos
  end
  clusters
end
Detect whether the codepoint is in a certain character class. Primarily used by the grapheme cluster support.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 376
# Detect whether the codepoint is in one of the given character classes.
# Each class name is looked up in UCD.boundary and matched with ===.
def in_char_class?(codepoint, classes)
  classes.any? { |char_class| UCD.boundary[char_class] === codepoint }
end
Justifies a string in a certain way. Valid values for way are :right, :left and :center. Is primarily used as a helper method by rjust, ljust and center.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 427
# Justifies a copy of str to a width of +integer+ characters.
#
# way is :right (pad on the left), :left (pad on the right) or :center
# (pad on both sides, the right side getting the extra character when the
# padding is odd). Any other value yields nil. Raises ArgumentError when
# padstr is empty.
def justify(str, integer, way, padstr=' ')
  raise ArgumentError, "zero width padding" if padstr.length == 0
  padsize = integer - size(str)
  padsize = 0 unless padsize > 0
  case way
  when :right then str.dup.insert(0, padding(padsize, padstr))
  when :left then str.dup.insert(-1, padding(padsize, padstr))
  when :center
    half = padsize / 2.0
    lpad = padding(half.floor, padstr)
    rpad = padding(half.ceil, padstr)
    str.dup.insert(0, lpad).insert(-1, rpad)
  end
end
Generates a padding string of a certain size.
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 444
# Generates a padding string of padsize characters by repeating padstr and
# cutting the result to length. Returns '' when padsize is zero.
def padding(padsize, padstr=' ')
  return '' if padsize == 0
  # One repetition more than needed, so the slice is always long enough.
  repeats = (padsize / size(padstr)) + 1
  slice(padstr * repeats, 0, padsize)
end
Re-order codepoints so the string becomes canonical
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 466
# Re-order codepoints so the string becomes canonical.
#
# Adjacent codepoints are swapped in place whenever the left one has a
# higher combining class than the right one and the right one's class is
# greater than zero. Returns the same array.
def reorder_characters(codepoints)
  last = codepoints.length - 1
  i = 0
  while i < last
    left = UCD[codepoints[i]]
    right = UCD[codepoints[i + 1]]
    if left.combining_class > right.combining_class && right.combining_class > 0
      codepoints[i], codepoints[i + 1] = right.code, left.code
      # After a swap step back one position, except at the start of the array.
      i += (i > 0 ? -1 : 1)
    else
      i += 1
    end
  end
  codepoints
end
Convert characters to a different case
# File vendor/rails/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb, line 453
# Convert characters to a different case.
#
# way names the mapping method called on each codepoint's UCD entry. A
# codepoint is kept unchanged when it has no entry or when the mapping is
# not greater than zero.
def to_case(way, str)
  mapped = u_unpack(str).map do |codepoint|
    entry = UCD[codepoint]
    if entry.nil?
      codepoint
    else
      target = entry.send(way)
      target > 0 ? target : codepoint
    end
  end
  mapped.pack('U*')
end