1 # encoding: utf-8
2
3 module ActiveSupport #:nodoc:
4 module Multibyte #:nodoc:
5 # Chars enables you to work transparently with UTF-8 encoding in the Ruby String class without having extensive
6 # knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an
7 # encoding safe manner. All the normal String methods are also implemented on the proxy.
8 #
9 # String methods are proxied through the Chars object, and can be accessed through the +mb_chars+ method. Methods
10 # which would normally return a String object now return a Chars object so methods can be chained.
11 #
12 # "The Perfect String ".mb_chars.downcase.strip.normalize #=> "the perfect string"
13 #
14 # Chars objects are perfectly interchangeable with String objects as long as no explicit class checks are made.
15 # If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them.
16 #
17 # bad.explicit_checking_method "T".mb_chars.downcase.to_s
18 #
19 # The default Chars implementation assumes that the encoding of the string is UTF-8, if you want to handle different
20 # encodings you can write your own multibyte string handler and configure it through
21 # ActiveSupport::Multibyte.proxy_class.
22 #
23 # class CharsForUTF32
24 # def size
25 # @wrapped_string.size / 4
26 # end
27 #
28 # def self.accepts?(string)
29 # string.length % 4 == 0
30 # end
31 # end
32 #
33 # ActiveSupport::Multibyte.proxy_class = CharsForUTF32
34 class Chars
35 # Hangul character boundaries and properties
# Number of leading consonants (L), vowels (V) and trailing consonants (T) in the Hangul block
HANGUL_LCOUNT = 19
HANGUL_VCOUNT = 21
HANGUL_TCOUNT = 28
# Syllables per leading consonant, and total number of precomposed syllables
HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT
HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT
# First codepoints of the syllable, L, V and T ranges
HANGUL_SBASE = 0xAC00
HANGUL_LBASE = 0x1100
HANGUL_VBASE = 0x1161
HANGUL_TBASE = 0x11A7
# Exclusive upper bound of the precomposed syllable range
HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT
# Inclusive bounds of the Hangul Jamo block
HANGUL_JAMO_FIRST = HANGUL_LBASE
HANGUL_JAMO_LAST = 0x11FF
48
49 # All the unicode whitespace
UNICODE_WHITESPACE = [
  (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
  0x0020,                # White_Space # Zs SPACE
  0x0085,                # White_Space # Cc <control-0085>
  0x00A0,                # White_Space # Zs NO-BREAK SPACE
  0x1680,                # White_Space # Zs OGHAM SPACE MARK
  0x180E,                # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
  (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
  0x2028,                # White_Space # Zl LINE SEPARATOR
  0x2029,                # White_Space # Zp PARAGRAPH SEPARATOR
  0x202F,                # White_Space # Zs NARROW NO-BREAK SPACE
  0x205F,                # White_Space # Zs MEDIUM MATHEMATICAL SPACE
  0x3000,                # White_Space # Zs IDEOGRAPHIC SPACE
].flatten.freeze

# BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish
# between little and big endian. This is not an issue in utf-8, so it must be ignored.
# Frozen like UNICODE_WHITESPACE: the patterns below are compiled once from this list, so a later
# mutation would silently disagree with them.
UNICODE_LEADERS_AND_TRAILERS = (UNICODE_WHITESPACE + [65279]).freeze # ZERO-WIDTH NO-BREAK SPACE aka BOM

# Returns a regular expression pattern (an alternation source string) that matches the passed
# Unicode codepoints. Each codepoint is packed into its UTF-8 character and joined with '|'.
def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
  array_of_codepoints.map { |codepoint| [codepoint].pack('U*') }.join('|')
end
# One or more leader/trailer characters anchored at the end, respectively the start, of the string.
UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/
UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/
75
# Pattern for a valid UTF-8 character, taken from the Multibyte lookup table.
UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8']

# The wrapped String is exposed directly; +to_s+ and +to_str+ both return that same object.
attr_reader :wrapped_string
alias_method :to_s, :wrapped_string
alias_method :to_str, :wrapped_string
81
if '1.9'.respond_to?(:force_encoding)
  # Creates a new Chars instance by wrapping _string_. When the runtime supports encodings, the
  # wrapped string itself is re-tagged as UTF-8 in place, unless it is frozen.
  def initialize(string)
    @wrapped_string = string
    return if string.frozen?
    string.force_encoding(Encoding::UTF_8)
  end
else
  # No encoding support on this runtime: just keep a reference to the string.
  def initialize(string) #:nodoc:
    @wrapped_string = string
  end
end
93
94 # Forward all undefined methods to the wrapped string.
def method_missing(method, *args, &block)
  # Everything not defined on the proxy is delegated to the wrapped string.
  outcome = @wrapped_string.__send__(method, *args, &block)
  # Bang methods always return the proxy itself, whatever the wrapped call returned.
  return self if method.to_s =~ /!$/
  # String results are re-wrapped so calls can be chained; other results pass through.
  outcome.kind_of?(String) ? chars(outcome) : outcome
end
104
105 # Returns +true+ if _obj_ responds to the given method. Private methods are included in the search
106 # only if the optional second parameter evaluates to +true+.
def respond_to?(method, include_private=false)
  # The proxy's own methods win; otherwise ask the wrapped string.
  own = super
  return own if own
  @wrapped_string.respond_to?(method, include_private) || false
end
110
111 # Enable more predictable duck-typing on String-like classes. See Object#acts_like?.
def acts_like_string?
  # Marker method: its presence (and truthy result) is what duck-typing checks look for.
  true
end
115
116 # Returns +true+ if the Chars class can and should act as a proxy for the string _string_. Returns
117 # +false+ otherwise.
def self.wants?(string)
  # Only proxy when the interpreter is in UTF8 mode and the string unpacks as UTF-8.
  return false unless $KCODE == 'UTF8'
  consumes?(string)
end
121
122 # Returns +true+ when the proxy class can handle the string. Returns +false+ otherwise.
def self.consumes?(string)
  begin
    # Unpack is a little bit faster than regular expressions; it raises on malformed UTF-8.
    string.unpack('U*')
  rescue ArgumentError
    return false
  end
  true
end
130
131 include Comparable
132
133 # Returns <tt>-1</tt>, <tt>0</tt> or <tt>+1</tt> depending on whether the Chars object is to be sorted before,
134 # equal or after the object on the right side of the operation. It accepts any object that implements +to_s+.
135 # See <tt>String#<=></tt> for more details.
136 #
137 # Example:
138 # 'é'.mb_chars <=> 'ü'.mb_chars #=> -1
def <=>(other)
  # Compare on the string representation so any object implementing +to_s+ is accepted.
  right_hand = other.to_s
  @wrapped_string <=> right_hand
end
142
143 # Returns a new Chars object containing the _other_ object concatenated to the string.
144 #
145 # Example:
146 # ('Café'.mb_chars + ' périferôl').to_s #=> "Café périferôl"
def +(other)
  # Append to a copy: `self << other` appended to the wrapped string in place, so `a + b`
  # silently changed `a` (and the String it wraps) instead of only returning the concatenation.
  chars(@wrapped_string.dup << other)
end
150
151 # Like <tt>String#=~</tt> only it returns the character offset (in codepoints) instead of the byte offset.
152 #
153 # Example:
154 # 'Café périferôl'.mb_chars =~ /ô/ #=> 12
def =~(other)
  # Match on the raw string, then convert the reported offset to a codepoint offset.
  raw_offset = (@wrapped_string =~ other)
  translate_offset(raw_offset)
end
158
159 # Works just like <tt>String#split</tt>, with the exception that the items in the resulting list are Chars
160 # instances instead of String. This makes chaining methods easier.
161 #
162 # Example:
163 # 'Café périferôl'.mb_chars.split(/é/).map { |part| part.upcase.to_s } #=> ["CAF", " P", "RIFERÔL"]
def split(*args)
  # Split natively, then wrap every piece via String#mb_chars so the results can be chained.
  pieces = @wrapped_string.split(*args)
  pieces.map { |piece| piece.mb_chars }
end
167
168 # Inserts the passed string at specified codepoint offsets.
169 #
170 # Example:
171 # 'Café'.mb_chars.insert(4, ' périferôl').to_s #=> "Café périferôl"
def insert(offset, fragment)
  codepoints = self.class.u_unpack(@wrapped_string)
  # Offsets count codepoints; anything past the end is rejected.
  raise IndexError, "index #{offset} out of string" if offset > codepoints.length
  codepoints.insert(offset, *self.class.u_unpack(fragment))
  # Rewrite the wrapped string in place so the original String object is updated too.
  @wrapped_string.replace(codepoints.pack('U*'))
  self
end
183
184 # Returns +true+ if contained string contains _other_. Returns +false+ otherwise.
185 #
186 # Example:
187 # 'Café'.mb_chars.include?('é') #=> true
def include?(other)
  # Defined explicitly (rather than left to method_missing) because Enumerable defines it.
  haystack = @wrapped_string
  haystack.include?(other)
end
192
193 # Returns the position _needle_ in the string, counting in codepoints. Returns +nil+ if _needle_ isn't found.
194 #
195 # Example:
196 # 'Café périferôl'.mb_chars.index('ô') #=> 12
197 # 'Café périferôl'.mb_chars.index(/\w/u) #=> 0
def index(needle, offset=0)
  # Convert the codepoint offset into an offset in the wrapped string.
  wrapped_offset = first(offset).wrapped_string.length
  found = @wrapped_string.index(needle, wrapped_offset)
  return nil unless found
  # Count the codepoints in front of the match.
  self.class.u_unpack(@wrapped_string.slice(0...found)).size
end
203
204 # Returns the position _needle_ in the string, counting in
205 # codepoints, searching backward from _offset_ or the end of the
206 # string. Returns +nil+ if _needle_ isn't found.
207 #
208 # Example:
209 # 'Café périferôl'.mb_chars.rindex('é') #=> 6
210 # 'Café périferôl'.mb_chars.rindex(/\w/u) #=> 13
def rindex(needle, offset=nil)
  # Without an explicit offset, search backward from the end of the string.
  offset ||= length
  wrapped_offset = first(offset).wrapped_string.length
  found = @wrapped_string.rindex(needle, wrapped_offset)
  return nil unless found
  # Count the codepoints in front of the match.
  self.class.u_unpack(@wrapped_string.slice(0...found)).size
end
217
218 # Like <tt>String#[]=</tt>, except instead of byte offsets you specify character offsets.
219 #
220 # Example:
221 #
222 # s = "Müller"
223 # s.mb_chars[2] = "e" # Replace character with offset 2
224 # s
225 # #=> "Müeler"
226 #
227 # s = "Müller"
228 # s.mb_chars[1, 2] = "ö" # Replace 2 characters at character offset 1
229 # s
230 # #=> "Möler"
def []=(*args)
  replace_by = args.pop
  # Indexed replace with regular expressions already works
  if args.first.is_a?(Regexp)
    @wrapped_string[*args] = replace_by
  else
    result = self.class.u_unpack(@wrapped_string)
    case args.first
    when Integer # Integer rather than Fixnum: Fixnum no longer exists on current Rubies
      raise IndexError, "index #{args[0]} out of string" if args[0] >= result.length
      min = args[0]
      # An optional second argument is the number of characters to replace
      max = args[1].nil? ? min : (min + args[1] - 1)
      range = Range.new(min, max)
      # A bare integer replacement is treated as a codepoint
      replace_by = [replace_by].pack('U') if replace_by.is_a?(Integer)
    when Range
      # Range#min is nil for ranges such as 1..-1, so check the start of the range instead
      start = args[0].begin || 0
      raise RangeError, "#{args[0]} out of range" if start >= result.length
      range = args[0]
    else
      # Anything else is a substring to look up and replace
      needle = args[0].to_s
      min = index(needle)
      raise IndexError, 'string not matched' if min.nil?
      max = min + self.class.u_unpack(needle).length - 1
      range = Range.new(min, max)
    end
    result[range] = self.class.u_unpack(replace_by)
    # Rewrite the wrapped string in place so the original String object is updated too
    @wrapped_string.replace(result.pack('U*'))
  end
end
257
258 # Works just like <tt>String#rjust</tt>, only integer specifies characters instead of bytes.
259 #
260 # Example:
261 #
262 # "¾ cup".mb_chars.rjust(8).to_s
263 # #=> " ¾ cup"
264 #
265 # "¾ cup".mb_chars.rjust(8, " ").to_s # Use non-breaking whitespace
266 # #=> " ¾ cup"
def rjust(integer, padstr=' ')
  # Padding goes in front of the text, measured in characters.
  justify(integer, :right, padstr)
end
270
271 # Works just like <tt>String#ljust</tt>, only integer specifies characters instead of bytes.
272 #
273 # Example:
274 #
# "¾ cup".mb_chars.ljust(8).to_s
276 # #=> "¾ cup "
277 #
# "¾ cup".mb_chars.ljust(8, " ").to_s # Use non-breaking whitespace
279 # #=> "¾ cup "
def ljust(integer, padstr=' ')
  # Padding goes after the text, measured in characters.
  justify(integer, :left, padstr)
end
283
284 # Works just like <tt>String#center</tt>, only integer specifies characters instead of bytes.
285 #
286 # Example:
287 #
288 # "¾ cup".mb_chars.center(8).to_s
289 # #=> " ¾ cup "
290 #
291 # "¾ cup".mb_chars.center(8, " ").to_s # Use non-breaking whitespace
292 # #=> " ¾ cup "
def center(integer, padstr=' ')
  # Padding is split over both sides of the text, measured in characters.
  justify(integer, :center, padstr)
end
296
297 # Strips entire range of Unicode whitespace from the right of the string.
def rstrip
  # Drop the run of Unicode whitespace/BOM characters at the end; returns a new proxy.
  trimmed = @wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')
  chars(trimmed)
end
301
302 # Strips entire range of Unicode whitespace from the left of the string.
def lstrip
  # Drop the run of Unicode whitespace/BOM characters at the start; returns a new proxy.
  trimmed = @wrapped_string.gsub(UNICODE_LEADERS_PAT, '')
  chars(trimmed)
end
306
307 # Strips entire range of Unicode whitespace from the right and left of the string.
def strip
  # Trim the right side first, then the left side of that result.
  right_trimmed = rstrip
  right_trimmed.lstrip
end
311
312 # Returns the number of codepoints in the string
def size
  # One array entry per codepoint, so the array size is the character count.
  codepoints = self.class.u_unpack(@wrapped_string)
  codepoints.size
end
316 alias_method :length, :size
317
318 # Reverses all characters in the string.
319 #
320 # Example:
321 # 'Café'.mb_chars.reverse.to_s #=> 'éfaC'
def reverse
  # Reverse the list of grapheme clusters, keeping the codepoints inside each cluster in order.
  clusters = self.class.g_unpack(@wrapped_string)
  chars(clusters.reverse.flatten.pack('U*'))
end
325
326 # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that
327 # character.
328 #
329 # Example:
330 # 'こんにちは'.mb_chars.slice(2..3).to_s #=> "にち"
def slice(*args)
  if args.size > 2
    raise ArgumentError, "wrong number of arguments (#{args.size} for 1)" # Do as if we were native
  elsif args.size == 2 && !(args.first.is_a?(Numeric) || args.first.is_a?(Regexp))
    raise TypeError, "cannot convert #{args.first.class} into Integer" # Do as if we were native
  elsif args.size == 2 && !args[1].is_a?(Numeric)
    raise TypeError, "cannot convert #{args[1].class} into Integer" # Do as if we were native
  elsif args[0].kind_of?(Regexp)
    # Regular expressions are handled by the wrapped string itself
    result = @wrapped_string.slice(*args)
  elsif args.size == 1 && args[0].kind_of?(Numeric)
    # A single index yields the one character at that codepoint offset
    character = self.class.u_unpack(@wrapped_string)[args[0]]
    result = character.nil? ? nil : [character].pack('U')
  else
    # Ranges and (start, length) pairs slice the codepoint array. Array#slice returns nil when
    # the start lies outside the string; pass that on as nil instead of calling pack on it.
    cps = self.class.u_unpack(@wrapped_string).slice(*args)
    result = cps.nil? ? nil : cps.pack('U*')
  end
  result.nil? ? nil : chars(result)
end
351 alias_method :[], :slice
352
353 # Like <tt>String#slice!</tt>, except instead of byte offsets you specify character offsets.
354 #
355 # Example:
356 # s = 'こんにちは'
357 # s.mb_chars.slice!(2..3).to_s #=> "にち"
358 # s #=> "こんは"
def slice!(*args)
  piece = self[*args]
  # Only cut when something was actually selected. Assigning '' for a selection that matched
  # nothing (nil, or an empty slice at the very end) raised IndexError, whereas String#slice!
  # returns nil or "" in those cases and leaves the string alone.
  self[*args] = '' unless piece.to_s.empty?
  piece
end
364
365 # Returns the codepoint of the first character in the string.
366 #
367 # Example:
368 # 'こんにちは'.mb_chars.ord #=> 12371
def ord
  # First entry of the codepoint list; nil when the string is empty.
  self.class.u_unpack(@wrapped_string).first
end
372
373 # Convert characters in the string to uppercase.
374 #
375 # Example:
376 # 'Laurent, òu sont les tests?'.mb_chars.upcase.to_s #=> "LAURENT, ÒU SONT LES TESTS?"
def upcase
  # Map every codepoint through the uppercase column of the Unicode database.
  apply_mapping(:uppercase_mapping)
end
380
381 # Convert characters in the string to lowercase.
382 #
383 # Example:
384 # 'VĚDA A VÝZKUM'.mb_chars.downcase.to_s #=> "věda a výzkum"
def downcase
  # Map every codepoint through the lowercase column of the Unicode database.
  apply_mapping(:lowercase_mapping)
end
388
389 # Converts the first character to uppercase and the remainder to lowercase.
390 #
391 # Example:
392 # 'über'.mb_chars.capitalize.to_s #=> "Über"
def capitalize
  # Either part may be missing (empty or one-character strings); fall back to an empty proxy.
  head = slice(0) || chars('')
  tail = slice(1..-1) || chars('')
  head.upcase + tail.downcase
end
396
397 # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
398 # passing strings to databases and validations.
399 #
400 # * <tt>str</tt> - The string to perform normalization on.
401 # * <tt>form</tt> - The form you want to normalize in. Should be one of the following:
402 # <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>. Default is
403 # ActiveSupport::Multibyte.default_normalization_form
def normalize(form=ActiveSupport::Multibyte.default_normalization_form)
  # See http://www.unicode.org/reports/tr15, Table 1
  proxy = self.class
  codepoints = proxy.u_unpack(@wrapped_string)
  # The D forms decompose and reorder; the C forms additionally compose the reordered result.
  normalized =
    case form
    when :d
      proxy.reorder_characters(proxy.decompose_codepoints(:canonical, codepoints))
    when :c
      proxy.compose_codepoints(proxy.reorder_characters(proxy.decompose_codepoints(:canonical, codepoints)))
    when :kd
      proxy.reorder_characters(proxy.decompose_codepoints(:compatability, codepoints))
    when :kc
      proxy.compose_codepoints(proxy.reorder_characters(proxy.decompose_codepoints(:compatability, codepoints)))
    else
      raise ArgumentError, "#{form} is not a valid normalization variant", caller
    end
  chars(normalized.pack('U*'))
end
420
421 # Performs canonical decomposition on all the characters.
422 #
423 # Example:
424 # 'é'.length #=> 2
425 # 'é'.mb_chars.decompose.to_s.length #=> 3
def decompose
  proxy = self.class
  # Canonical decomposition only; no reordering or composition is applied here.
  decomposed = proxy.decompose_codepoints(:canonical, proxy.u_unpack(@wrapped_string))
  chars(decomposed.pack('U*'))
end
429
430 # Performs composition on all the characters.
431 #
432 # Example:
433 # 'é'.length #=> 3
434 # 'é'.mb_chars.compose.to_s.length #=> 2
def compose
  proxy = self.class
  # Compose the codepoints exactly as they are; no decomposition or reordering happens first.
  composed = proxy.compose_codepoints(proxy.u_unpack(@wrapped_string))
  chars(composed.pack('U*'))
end
438
439 # Returns the number of grapheme clusters in the string.
440 #
441 # Example:
442 # 'क्षि'.mb_chars.length #=> 4
443 # 'क्षि'.mb_chars.g_length #=> 3
def g_length
  # One list entry per grapheme cluster.
  clusters = self.class.g_unpack(@wrapped_string)
  clusters.length
end
447
448 # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
def tidy_bytes
  # The byte clean-up lives on the class; wrap its result in a new proxy.
  cleaned = self.class.tidy_bytes(@wrapped_string)
  chars(cleaned)
end
452
# Define the destructive variant of each listed method: run the non-destructive version,
# store its string result as the new wrapped string and return the proxy itself.
%w(lstrip rstrip strip reverse upcase downcase tidy_bytes capitalize).each do |method|
  define_method("#{method}!") do |*args|
    # A splat is always an Array (never nil), so a separate no-argument call is not needed.
    @wrapped_string = send(method, *args).to_s
    self
  end
end
463
464 class << self
465
466 # Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't
467 # valid UTF-8.
468 #
469 # Example:
470 # Chars.u_unpack('Café') #=> [67, 97, 102, 233]
def u_unpack(string)
  string.unpack 'U*'
rescue ArgumentError
  # unpack signals malformed input with ArgumentError; report it as an encoding problem instead.
  raise EncodingError, 'malformed UTF-8 character'
end
478
479 # Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified
480 # character class and +false+ otherwise. Valid character classes are: <tt>:cr</tt>, <tt>:lf</tt>, <tt>:l</tt>,
481 # <tt>:v</tt>, <tt>:lv</tt>, <tt>:lvt</tt> and <tt>:t</tt>.
482 #
483 # Primarily used by the grapheme cluster support.
def in_char_class?(codepoint, classes)
  # A class matches when its boundary entry case-matches (===) the codepoint.
  classes.any? { |char_class| UCD.boundary[char_class] === codepoint }
end
487
488 # Unpack the string at grapheme boundaries. Returns a list of character lists.
489 #
490 # Example:
491 # Chars.g_unpack('क्षि') #=> [[2325, 2381], [2359], [2367]]
492 # Chars.g_unpack('Café') #=> [[67], [97], [102], [233]]
def g_unpack(string)
  codepoints = u_unpack(string)
  unpacked = []
  pos = 0
  marker = 0 # index where the cluster currently being collected starts
  eoc = codepoints.length
  while pos < eoc
    pos += 1
    previous = codepoints[pos - 1]
    current = codepoints[pos] # nil once we are past the last codepoint
    # No cluster boundary between previous and current when one of these rules applies
    # (X marks the position where no break is allowed):
    joined =
      (previous == UCD.boundary[:cr] && current == UCD.boundary[:lf]) ||                  # CR X LF
      (UCD.boundary[:l] === previous && in_char_class?(current, [:l, :v, :lv, :lvt])) ||  # L X (L|V|LV|LVT)
      (in_char_class?(previous, [:lv, :v]) && in_char_class?(current, [:v, :t])) ||       # (LV|V) X (V|T)
      (in_char_class?(previous, [:lvt, :t]) && UCD.boundary[:t] === current) ||           # (LVT|T) X (T)
      (UCD.boundary[:extend] === current)                                                 # X Extend
    next if joined
    # Boundary found: everything from marker up to and including previous is one cluster
    unpacked << codepoints[marker..pos - 1]
    marker = pos
  end
  unpacked
end
522
523 # Reverse operation of g_unpack.
524 #
525 # Example:
526 # Chars.g_pack(Chars.g_unpack('क्षि')) #=> 'क्षि'
def g_pack(unpacked)
  # Clusters are lists of codepoints; flatten them into one list before packing.
  codepoints = unpacked.flatten
  codepoints.pack('U*')
end
530
def padding(padsize, padstr=' ') #:nodoc:
  return '' if padsize == 0
  # Repeat the pad string once more than strictly fits, then cut to exactly padsize characters.
  repeats = (padsize / u_unpack(padstr).size) + 1
  new(padstr * repeats).slice(0, padsize)
end
538
539 # Re-order codepoints so the string becomes canonical.
def reorder_characters(codepoints)
  last_pair = codepoints.length - 1
  cursor = 0
  while cursor < last_pair
    left = UCD.codepoints[codepoints[cursor]]
    right = UCD.codepoints[codepoints[cursor + 1]]
    # A pair is out of order when the right-hand mark has a non-zero combining class that is
    # lower than the class of the codepoint in front of it.
    if (left.combining_class > right.combining_class) && (right.combining_class > 0)
      codepoints[cursor, 2] = [right.code, left.code]
      # Step back after a swap so the moved codepoint is compared with its new neighbour
      cursor += (cursor > 0 ? -1 : 1)
    else
      cursor += 1
    end
  end
  codepoints
end
554
555 # Decompose composed characters to the decomposed form.
def decompose_codepoints(type, codepoints)
  decomposed = []
  codepoints.each do |cp|
    if HANGUL_SBASE <= cp && cp < HANGUL_SLAST
      # Precomposed Hangul syllables are decomposed arithmetically into L, V and optional T jamo
      sindex = cp - HANGUL_SBASE
      tindex = sindex % HANGUL_TCOUNT
      decomposed << HANGUL_LBASE + sindex / HANGUL_NCOUNT
      decomposed << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
      decomposed << HANGUL_TBASE + tindex unless tindex == 0
    elsif (ncp = UCD.codepoints[cp].decomp_mapping) && (!UCD.codepoints[cp].decomp_type || type == :compatability)
      # The codepoint has a mapping that applies for the requested decomposition type:
      # mappings without a decomp_type always apply, typed ones only for :compatability.
      # Decompose the mapping recursively, since its members may be decomposable themselves.
      decomposed.concat decompose_codepoints(type, ncp.dup)
    else
      decomposed << cp
    end
  end
  decomposed
end
575
576 # Compose decomposed characters to the composed form.
def compose_codepoints(codepoints)
  pos = 0
  eoa = codepoints.length - 1 # index of the last codepoint; shrinks as codepoints are merged
  starter_pos = 0
  starter_char = codepoints[0]
  previous_combining_class = -1
  while pos < eoa
    pos += 1
    lindex = starter_char - HANGUL_LBASE
    if (0...HANGUL_LCOUNT).include?(lindex)
      # -- Hangul: the starter is a leading consonant, look for a vowel and optional trailing jamo
      vindex = begin
        codepoints[starter_pos + 1] - HANGUL_VBASE
      rescue
        -1 # nothing (usable) after the starter
      end
      if (0...HANGUL_VCOUNT).include?(vindex)
        tindex = begin
          codepoints[starter_pos + 2] - HANGUL_TBASE
        rescue
          -1 # nothing (usable) after the vowel
        end
        if (0...HANGUL_TCOUNT).include?(tindex)
          consumed = 3 # L + V + T
        else
          tindex = 0
          consumed = 2 # L + V
        end
        eoa -= consumed - 1
        # Replace the jamo sequence by the single precomposed syllable
        codepoints[starter_pos, consumed] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
      end
      starter_pos += 1
      starter_char = codepoints[starter_pos]
    else
      # -- Other characters: try to combine the current codepoint with the last starter
      current_char = codepoints[pos]
      current = UCD.codepoints[current_char]
      composition = nil
      if current.combining_class > previous_combining_class
        candidates = UCD.composition_map[starter_char]
        composition = candidates[current_char] if candidates
      end
      if composition.nil?
        previous_combining_class = current.combining_class
      else
        # Merge into the starter and remove the combined codepoint from the list
        codepoints[starter_pos] = composition
        starter_char = composition
        codepoints.delete_at pos
        eoa -= 1
        pos -= 1
        previous_combining_class = -1
      end
      # A codepoint with combining class 0 becomes the new starter
      if current.combining_class == 0
        starter_pos = pos
        starter_char = codepoints[pos]
      end
    end
  end
  codepoints
end
634
635 # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.
def tidy_bytes(string)
  tidied = string.split(//u).map do |c|
    c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding)

    if ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c)
      # Already a valid UTF-8 character: keep as is
      c
    else
      # Otherwise re-encode the first byte of the piece, depending on its value
      n = c.unpack('C')[0]
      if n < 128
        n.chr
      elsif n < 160
        # Use the UCD.cp1252 table entry when there is one, else the byte value itself
        [UCD.cp1252[n] || n].pack('U')
      elsif n < 192
        "\xC2" + n.chr
      else
        "\xC3" + (n-64).chr
      end
    end
  end
  tidied.join
end
650 end
651
652 protected
653
def translate_offset(byte_offset) #:nodoc:
  return nil if byte_offset.nil?
  return 0 if @wrapped_string == ''
  chunk = @wrapped_string[0..byte_offset]
  loop do
    begin
      # Number of codepoints up to and including the offset, minus one, is the character offset
      return chunk.unpack('U*').length - 1
    rescue ArgumentError
      # We damaged a character: extend the chunk by one position and try again
      byte_offset += 1
      chunk = @wrapped_string[0..byte_offset]
      # Stop retrying at the end of the string and report the problem as an encoding error
      raise EncodingError, 'malformed UTF-8 character' unless byte_offset < chunk.length
    end
  end
end
673
def justify(integer, way, padstr=' ') #:nodoc:
  raise ArgumentError, "zero width padding" if padstr.length == 0
  # Never pad by a negative amount when the text is already wider than requested
  padsize = [integer - size, 0].max
  result =
    case way
    when :right
      @wrapped_string.dup.insert(0, self.class.padding(padsize, padstr))
    when :left
      @wrapped_string.dup.insert(-1, self.class.padding(padsize, padstr))
    when :center
      # With an odd amount of padding the extra character goes on the right
      lpad = self.class.padding((padsize / 2.0).floor, padstr)
      rpad = self.class.padding((padsize / 2.0).ceil, padstr)
      @wrapped_string.dup.insert(0, lpad).insert(-1, rpad)
    end
  chars(result)
end
690
def apply_mapping(mapping) #:nodoc:
  mapped = self.class.u_unpack(@wrapped_string).map do |codepoint|
    cp = UCD.codepoints[codepoint]
    # Use the mapped codepoint only when the database has an entry with a positive mapping
    ncp = cp && cp.send(mapping)
    (ncp && ncp > 0) ? ncp : codepoint
  end
  chars(mapped.pack('U*'))
end
701
def chars(string) #:nodoc:
  # Wrap with the receiver's own class so subclasses keep producing their own proxy type.
  proxy_class = self.class
  proxy_class.new(string)
end
705 end
706 end
707 end