You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

  1. #!/usr/local/bin/ruby -w
  2. # = faster_csv.rb -- Faster CSV Reading and Writing
  3. #
  4. # Created by James Edward Gray II on 2005-10-31.
  5. # Copyright 2005 Gray Productions. All rights reserved.
  6. #
  7. # See FasterCSV for documentation.
  # Refuse to load on Ruby 1.9 or later, where the standard CSV library
  # supersedes this one. The heredoc terminator must be present, otherwise the
  # message swallows the rest of the file.
  if RUBY_VERSION >= "1.9"
    abort <<-VERSION_WARNING.gsub(/^\s+/, "")
    Please switch to Ruby 1.9's standard CSV library. It's FasterCSV plus
    support for Ruby 1.9's m17n encoding engine.
    VERSION_WARNING
  end
  14. require "forwardable"
  15. require "English"
  16. require "enumerator"
  17. require "date"
  18. require "stringio"
  19. #
  20. # This class provides a complete interface to CSV files and data. It offers
  21. # tools to enable you to read and write to and from Strings or IO objects, as
  22. # needed.
  23. #
  24. # == Reading
  25. #
  26. # === From a File
  27. #
  28. # ==== A Line at a Time
  29. #
  30. # FasterCSV.foreach("path/to/file.csv") do |row|
  31. # # use row here...
  32. # end
  33. #
  34. # ==== All at Once
  35. #
  36. # arr_of_arrs = FasterCSV.read("path/to/file.csv")
  37. #
  38. # === From a String
  39. #
  40. # ==== A Line at a Time
  41. #
  42. # FasterCSV.parse("CSV,data,String") do |row|
  43. # # use row here...
  44. # end
  45. #
  46. # ==== All at Once
  47. #
  48. # arr_of_arrs = FasterCSV.parse("CSV,data,String")
  49. #
  50. # == Writing
  51. #
  52. # === To a File
  53. #
  54. # FasterCSV.open("path/to/file.csv", "w") do |csv|
  55. # csv << ["row", "of", "CSV", "data"]
  56. # csv << ["another", "row"]
  57. # # ...
  58. # end
  59. #
  60. # === To a String
  61. #
  62. # csv_string = FasterCSV.generate do |csv|
  63. # csv << ["row", "of", "CSV", "data"]
  64. # csv << ["another", "row"]
  65. # # ...
  66. # end
  67. #
  68. # == Convert a Single Line
  69. #
  70. # csv_string = ["CSV", "data"].to_csv # to CSV
  71. # csv_array = "CSV,String".parse_csv # from CSV
  72. #
  73. # == Shortcut Interface
  74. #
  75. # FCSV { |csv_out| csv_out << %w{my data here} } # to $stdout
  76. # FCSV(csv = "") { |csv_str| csv_str << %w{my data here} } # to a String
  77. # FCSV($stderr) { |csv_err| csv_err << %w{my data here} } # to $stderr
  78. #
  79. class FasterCSV
  80. # The version of the installed library.
  81. VERSION = "1.5.0".freeze
  82. #
  83. # A FasterCSV::Row is part Array and part Hash. It retains an order for the
  84. # fields and allows duplicates just as an Array would, but also allows you to
  85. # access fields by name just as you could if they were in a Hash.
  86. #
  87. # All rows returned by FasterCSV will be constructed from this class, if
  88. # header row processing is activated.
  89. #
  class Row
    #
    # Construct a new FasterCSV::Row from +headers+ and +fields+, which are
    # expected to be Arrays. If one Array is shorter than the other, it will be
    # padded with +nil+ objects.
    #
    # The optional +header_row+ parameter can be set to +true+ to indicate, via
    # FasterCSV::Row.header_row?() and FasterCSV::Row.field_row?(), that this is
    # a header row. Otherwise, the row is assumed to be a field row.
    #
    # A FasterCSV::Row object supports the following Array methods through
    # delegation:
    #
    # * empty?()
    # * length()
    # * size()
    #
    def initialize(headers, fields, header_row = false)
      @header_row = header_row

      # Array#zip() only pads its argument, so always zip from the longer side
      # and flip the pairs back into [header, field] order when needed
      @row = if headers.size > fields.size
        headers.zip(fields)
      else
        fields.zip(headers).map { |pair| pair.reverse }
      end
    end

    # Internal data format used to compare equality.
    attr_reader :row
    protected   :row

    ### Array Delegation ###

    extend Forwardable
    def_delegators :@row, :empty?, :length, :size

    # Returns +true+ if this is a header row.
    def header_row?
      @header_row
    end

    # Returns +true+ if this is a field row.
    def field_row?
      not header_row?
    end

    # Returns the headers of this row.
    def headers
      @row.map { |pair| pair.first }
    end

    #
    # :call-seq:
    #   field( header )
    #   field( header, offset )
    #   field( index )
    #
    # This method will fetch the field value by +header+ or +index+. If a field
    # is not found, +nil+ is returned.
    #
    # When provided, +offset+ ensures that a header match occurs on or later
    # than the +offset+ index. You can use this to find duplicate headers,
    # without resorting to hard-coding exact indices.
    #
    def field(header_or_index, minimum_index = 0)
      # locate the pair (slicing past the end yields nil, so fall back to [])
      finder = header_or_index.is_a?(Integer) ? :[] : :assoc
      pair   = (@row[minimum_index..-1] || []).send(finder, header_or_index)

      # return the field if we have a pair
      pair.nil? ? nil : pair.last
    end
    alias_method :[], :field

    #
    # :call-seq:
    #   []=( header, value )
    #   []=( header, offset, value )
    #   []=( index, value )
    #
    # Looks up the field by the semantics described in FasterCSV::Row.field()
    # and assigns the +value+.
    #
    # Assigning past the end of the row with an index will set all pairs between
    # to <tt>[nil, nil]</tt>. Assigning to an unused header appends the new
    # pair.
    #
    def []=(*args)
      value = args.pop

      if args.first.is_a? Integer
        if @row[args.first].nil?  # extending past the end with index
          @row[args.first] = [nil, value]
          # the assignment above left nil holes, turn them into empty pairs
          @row.map! { |pair| pair.nil? ? [nil, nil] : pair }
        else                      # normal index assignment
          @row[args.first][1] = value
        end
      else
        position = index(*args)
        if position.nil?          # appending a field
          self << [args.first, value]
        else                      # normal header assignment
          @row[position][1] = value
        end
      end
    end

    #
    # :call-seq:
    #   <<( field )
    #   <<( header_and_field_array )
    #   <<( header_and_field_hash )
    #
    # If a two-element Array is provided, it is assumed to be a header and field
    # and the pair is appended. A Hash works the same way with the key being
    # the header and the value being the field. Anything else is assumed to be
    # a lone field which is appended with a +nil+ header.
    #
    # This method returns the row for chaining.
    #
    def <<(arg)
      if arg.is_a?(Array) and arg.size == 2  # appending a header and name
        @row << arg
      elsif arg.is_a?(Hash)                  # append header and name pairs
        arg.each { |pair| @row << pair }
      else                                   # append field value
        @row << [nil, arg]
      end

      self  # for chaining
    end

    #
    # A shortcut for appending multiple fields. Equivalent to:
    #
    #   args.each { |arg| faster_csv_row << arg }
    #
    # This method returns the row for chaining.
    #
    def push(*args)
      args.each { |arg| self << arg }

      self  # for chaining
    end

    #
    # :call-seq:
    #   delete( header )
    #   delete( header, offset )
    #   delete( index )
    #
    # Used to remove a pair from the row by +header+ or +index+. The pair is
    # located as described in FasterCSV::Row.field(). The deleted pair is
    # returned, or +nil+ if a pair could not be found.
    #
    def delete(header_or_index, minimum_index = 0)
      position = if header_or_index.is_a? Integer  # by index
        header_or_index
      else                                         # by header
        index(header_or_index, minimum_index)
      end

      # an unknown header has no position; Array#delete_at(nil) would raise
      position.nil? ? nil : @row.delete_at(position)
    end

    #
    # The provided +block+ is passed a header and field for each pair in the row
    # and expected to return +true+ or +false+, depending on whether the pair
    # should be deleted.
    #
    # This method returns the row for chaining.
    #
    def delete_if(&block)
      @row.delete_if(&block)

      self  # for chaining
    end

    #
    # This method accepts any number of arguments which can be headers, indices,
    # Ranges of either, or two-element Arrays containing a header and offset.
    # Each argument will be replaced with a field lookup as described in
    # FasterCSV::Row.field().
    #
    # If called with no arguments, all fields are returned.
    #
    def fields(*headers_and_or_indices)
      if headers_and_or_indices.empty?  # return all fields--no arguments
        @row.map { |pair| pair.last }
      else                              # or work like values_at()
        headers_and_or_indices.inject(Array.new) do |all, h_or_i|
          found = if h_or_i.is_a? Range
            # translate header endpoints into indices, keeping exclusivity
            index_begin = h_or_i.begin.is_a?(Integer) ? h_or_i.begin :
                                                        index(h_or_i.begin)
            index_end   = h_or_i.end.is_a?(Integer)   ? h_or_i.end :
                                                        index(h_or_i.end)
            new_range   = h_or_i.exclude_end? ? (index_begin...index_end) :
                                                (index_begin..index_end)
            fields.values_at(new_range)
          else
            [field(*Array(h_or_i))]
          end
          all + found
        end
      end
    end
    alias_method :values_at, :fields

    #
    # :call-seq:
    #   index( header )
    #   index( header, offset )
    #
    # This method will return the index of a field with the provided +header+.
    # The +offset+ can be used to locate duplicate header names, as described in
    # FasterCSV::Row.field(). Returns +nil+ when no such header exists.
    #
    def index(header, minimum_index = 0)
      # find the pair (slicing past the end yields nil, so fall back to [])
      position = (headers[minimum_index..-1] || []).index(header)

      # return the index at the right offset, if we found one
      position.nil? ? nil : position + minimum_index
    end

    # Returns +true+ if +name+ is a header for this row, and +false+ otherwise.
    def header?(name)
      headers.include? name
    end
    alias_method :include?, :header?

    #
    # Returns +true+ if +data+ matches a field in this row, and +false+
    # otherwise.
    #
    def field?(data)
      fields.include? data
    end

    include Enumerable

    #
    # Yields each pair of the row as header and field tuples (much like
    # iterating over a Hash).
    #
    # Support for Enumerable.
    #
    # This method returns the row for chaining.
    #
    def each(&block)
      @row.each(&block)

      self  # for chaining
    end

    #
    # Returns +true+ if this row contains the same headers and fields in the
    # same order as +other+.
    #
    def ==(other)
      @row == other.row
    end

    #
    # Collapses the row into a simple Hash. Be warned that this discards field
    # order and clobbers duplicate fields.
    #
    def to_hash
      # flatten just one level of the internal Array
      Hash[*@row.inject(Array.new) { |ary, pair| ary.push(*pair) }]
    end

    #
    # Returns the row as a CSV String. Headers are not used. Equivalent to:
    #
    #   faster_csv_row.fields.to_csv( options )
    #
    def to_csv(options = Hash.new)
      fields.to_csv(options)
    end
    alias_method :to_s, :to_csv

    # A summary of fields, by header.
    def inspect
      str = "#<#{self.class}"
      each do |header, field|
        str << " #{header.is_a?(Symbol) ? header.to_s : header.inspect}:" <<
               field.inspect
      end
      str << ">"
    end
  end
  350. #
  351. # A FasterCSV::Table is a two-dimensional data structure for representing CSV
  352. # documents. Tables allow you to work with the data by row or column,
  353. # manipulate the data, and even convert the results back to CSV, if needed.
  354. #
  355. # All tables returned by FasterCSV will be constructed from this class, if
  356. # header row processing is activated.
  357. #
  class Table
    #
    # Construct a new FasterCSV::Table from +array_of_rows+, which are expected
    # to be FasterCSV::Row objects. All rows are assumed to have the same
    # headers.
    #
    # A FasterCSV::Table object supports the following Array methods through
    # delegation:
    #
    # * empty?()
    # * length()
    # * size()
    #
    def initialize(array_of_rows)
      @table = array_of_rows
      @mode  = :col_or_row
    end

    # The current access mode for indexing and iteration.
    attr_reader :mode

    # Internal data format used to compare equality.
    attr_reader :table
    protected   :table

    ### Array Delegation ###

    extend Forwardable
    def_delegators :@table, :empty?, :length, :size

    #
    # Returns a duplicate table object, in column mode. This is handy for
    # chaining in a single call without changing the table mode, but be aware
    # that this method can consume a fair amount of memory for bigger data sets.
    #
    # This method returns the duplicate table for chaining. Don't chain
    # destructive methods (like []=()) this way though, since you are working
    # with a duplicate.
    #
    def by_col
      self.class.new(@table.dup).by_col!
    end

    #
    # Switches the mode of this table to column mode. All calls to indexing and
    # iteration methods will work with columns until the mode is changed again.
    #
    # This method returns the table and is safe to chain.
    #
    def by_col!
      @mode = :col

      self
    end

    #
    # Returns a duplicate table object, in mixed mode. This is handy for
    # chaining in a single call without changing the table mode, but be aware
    # that this method can consume a fair amount of memory for bigger data sets.
    #
    # This method returns the duplicate table for chaining. Don't chain
    # destructive methods (like []=()) this way though, since you are working
    # with a duplicate.
    #
    def by_col_or_row
      self.class.new(@table.dup).by_col_or_row!
    end

    #
    # Switches the mode of this table to mixed mode. All calls to indexing and
    # iteration methods will use the default intelligent indexing system until
    # the mode is changed again. In mixed mode an index is assumed to be a row
    # reference while anything else is assumed to be column access by headers.
    #
    # This method returns the table and is safe to chain.
    #
    def by_col_or_row!
      @mode = :col_or_row

      self
    end

    #
    # Returns a duplicate table object, in row mode. This is handy for chaining
    # in a single call without changing the table mode, but be aware that this
    # method can consume a fair amount of memory for bigger data sets.
    #
    # This method returns the duplicate table for chaining. Don't chain
    # destructive methods (like []=()) this way though, since you are working
    # with a duplicate.
    #
    def by_row
      self.class.new(@table.dup).by_row!
    end

    #
    # Switches the mode of this table to row mode. All calls to indexing and
    # iteration methods will work with rows until the mode is changed again.
    #
    # This method returns the table and is safe to chain.
    #
    def by_row!
      @mode = :row

      self
    end

    #
    # Returns the headers for the first row of this table (assumed to match all
    # other rows). An empty Array is returned for empty tables.
    #
    def headers
      if @table.empty?
        Array.new
      else
        @table.first.headers
      end
    end

    #
    # In the default mixed mode, this method returns rows for index access and
    # columns for header access. You can force the index association by first
    # calling by_col!() or by_row!().
    #
    # Columns are returned as an Array of values. Altering that Array has no
    # effect on the table.
    #
    def [](index_or_header)
      if @mode == :row or  # by index
         (@mode == :col_or_row and index_or_header.is_a? Integer)
        @table[index_or_header]
      else                 # by header
        @table.map { |row| row[index_or_header] }
      end
    end

    #
    # In the default mixed mode, this method assigns rows for index access and
    # columns for header access. You can force the index association by first
    # calling by_col!() or by_row!().
    #
    # Rows may be set to an Array of values (which will inherit the table's
    # headers()) or a FasterCSV::Row.
    #
    # Columns may be set to a single value, which is copied to each row of the
    # column, or an Array of values. Arrays of values are assigned to rows top
    # to bottom in row major order. Excess values are ignored and if the Array
    # does not have a value for each row the extra rows will receive a +nil+.
    #
    # Assigning to an existing column or row clobbers the data. Assigning to
    # new columns creates them at the right end of the table.
    #
    def []=(index_or_header, value)
      if @mode == :row or  # by index
         (@mode == :col_or_row and index_or_header.is_a? Integer)
        if value.is_a? Array
          @table[index_or_header] = Row.new(headers, value)
        else
          @table[index_or_header] = value
        end
      else                 # set column
        @table.each_with_index do |row, i|
          row[index_or_header] = if row.header_row?
            index_or_header  # header rows carry the column name itself
          elsif value.is_a? Array
            value[i]         # multiple values, one per row
          else
            value            # repeated value
          end
        end
      end
    end

    #
    # The mixed mode default is to treat a list of indices as row access,
    # returning the rows indicated. Anything else is considered columnar
    # access. For columnar access, the return set has an Array for each row
    # with the values indicated by the headers in each Array. You can force
    # column or row mode using by_col!() or by_row!().
    #
    # You cannot mix column and row access.
    #
    def values_at(*indices_or_headers)
      if @mode == :row or  # by indices
         ( @mode == :col_or_row and indices_or_headers.all? do |index|
                                      index.is_a?(Integer)         or
                                      ( index.is_a?(Range)         and
                                        index.first.is_a?(Integer) and
                                        index.last.is_a?(Integer) )
                                    end )
        @table.values_at(*indices_or_headers)
      else                 # by headers
        @table.map { |row| row.values_at(*indices_or_headers) }
      end
    end

    #
    # Adds a new row to the bottom end of this table. You can provide an Array,
    # which will be converted to a FasterCSV::Row (inheriting the table's
    # headers()), or a FasterCSV::Row.
    #
    # This method returns the table for chaining.
    #
    def <<(row_or_array)
      if row_or_array.is_a? Array  # append Array
        @table << Row.new(headers, row_or_array)
      else                         # append Row
        @table << row_or_array
      end

      self  # for chaining
    end

    #
    # A shortcut for appending multiple rows. Equivalent to:
    #
    #   rows.each { |row| self << row }
    #
    # This method returns the table for chaining.
    #
    def push(*rows)
      rows.each { |row| self << row }

      self  # for chaining
    end

    #
    # Removes and returns the indicated column or row. In the default mixed
    # mode indices refer to rows and everything else is assumed to be a column
    # header. Use by_col!() or by_row!() to force the lookup.
    #
    def delete(index_or_header)
      if @mode == :row or  # by index
         (@mode == :col_or_row and index_or_header.is_a? Integer)
        @table.delete_at(index_or_header)
      else                 # by header
        @table.map { |row| row.delete(index_or_header).last }
      end
    end

    #
    # Removes any column or row for which the block returns +true+. In the
    # default mixed mode or row mode, iteration is the standard row major
    # walking of rows. In column mode, iteration will +yield+ two element
    # tuples containing the column name and an Array of values for that column.
    #
    # This method returns the table for chaining.
    #
    def delete_if(&block)
      if @mode == :row or @mode == :col_or_row  # by index
        @table.delete_if(&block)
      else                                      # by header
        # decide on every column first, so deletions cannot disturb the walk
        to_delete = headers.select { |header| block[[header, self[header]]] }
        to_delete.each { |header| delete(header) }
      end

      self  # for chaining
    end

    include Enumerable

    #
    # In the default mixed mode or row mode, iteration is the standard row major
    # walking of rows. In column mode, iteration will +yield+ two element
    # tuples containing the column name and an Array of values for that column.
    #
    # This method returns the table for chaining.
    #
    def each(&block)
      if @mode == :col
        headers.each { |header| block[[header, self[header]]] }
      else
        @table.each(&block)
      end

      self  # for chaining
    end

    # Returns +true+ if all rows of this table ==() +other+'s rows.
    def ==(other)
      @table == other.table
    end

    #
    # Returns the table as an Array of Arrays. Headers will be the first row,
    # then all of the field rows will follow.
    #
    def to_a
      # the seed Array is fresh, so appending in place is safe
      @table.inject([headers]) do |array, row|
        row.header_row? ? array : array << row.fields
      end
    end

    #
    # Returns the table as a complete CSV String. Headers will be listed first,
    # then all of the field rows.
    #
    def to_csv(options = Hash.new)
      @table.inject([headers.to_csv(options)]) do |rows, row|
        row.header_row? ? rows : rows << row.fields.to_csv(options)
      end.join
    end
    alias_method :to_s, :to_csv

    # A summary of the access mode and row count (headers included).
    def inspect
      "#<#{self.class} mode:#{@mode} row_count:#{to_a.size}>"
    end
  end
  # Raised by the parser when it runs into illegally formatted CSV data.
  class MalformedCSVError < RuntimeError
  end
  #
  # A FieldInfo Struct contains details about a field's position in the data
  # source it was read from. FasterCSV will pass this Struct to some blocks
  # that make decisions based on field structure. See
  # FasterCSV.convert_fields() for an example.
  #
  # <b><tt>index</tt></b>::  The zero-based index of the field in its row.
  # <b><tt>line</tt></b>::   The line of the data source this row is from.
  # <b><tt>header</tt></b>:: The header for the column, when available.
  #
  FieldInfo = Struct.new(:index, :line, :header)
  # Recognizes the common Date layouts the <tt>:date</tt> converter handles.
  DateMatcher     = / \A(?: (\w+,?\s+)?\w+\s+\d{1,2},?\s+\d{2,4} |
                            \d{4}-\d{2}-\d{2} )\z /x
  # Recognizes the common DateTime layouts the <tt>:date_time</tt> converter
  # handles.
  DateTimeMatcher =
    / \A(?: (\w+,?\s+)?\w+\s+\d{1,2}\s+\d{1,2}:\d{1,2}:\d{1,2},?\s+\d{2,4} |
            \d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2} )\z /x
  #
  # The built-in, named field converters of FasterCSV. Pick them with
  # FasterCSV.convert() or through the +options+ Hash given to
  # FasterCSV::new().
  #
  # <b><tt>:integer</tt></b>::    Converts any field Integer() accepts.
  # <b><tt>:float</tt></b>::      Converts any field Float() accepts.
  # <b><tt>:numeric</tt></b>::    A combination of <tt>:integer</tt>
  #                               and <tt>:float</tt>.
  # <b><tt>:date</tt></b>::       Converts any field Date::parse() accepts.
  # <b><tt>:date_time</tt></b>::  Converts any field DateTime::parse() accepts.
  # <b><tt>:all</tt></b>::        All built-in converters. A combination of
  #                               <tt>:date_time</tt> and <tt>:numeric</tt>.
  #
  # This Hash is intentionally left unfrozen and users should feel free to add
  # values to it that can be accessed by all FasterCSV objects.
  #
  # To add a combo field, the value should be an Array of names. Combo fields
  # can be nested with other combo fields.
  #
  Converters = {
    # each converter hands the field back untouched when it does not apply
    :integer => lambda { |f|
      begin
        Integer(f)
      rescue
        f
      end
    },
    :float => lambda { |f|
      begin
        Float(f)
      rescue
        f
      end
    },
    :numeric => [:integer, :float],
    :date => lambda { |f|
      if f =~ DateMatcher
        begin
          Date.parse(f)
        rescue
          f
        end
      else
        f
      end
    },
    :date_time => lambda { |f|
      if f =~ DateTimeMatcher
        begin
          DateTime.parse(f)
        rescue
          f
        end
      else
        f
      end
    },
    :all => [:date_time, :numeric]
  }
  #
  # This Hash holds the built-in header converters of FasterCSV that can be
  # accessed by name. You can select HeaderConverters with
  # FasterCSV.header_convert() or through the +options+ Hash passed to
  # FasterCSV::new().
  #
  # <b><tt>:downcase</tt></b>::  Calls downcase() on the header String.
  # <b><tt>:symbol</tt></b>::    The header String is downcased, spaces are
  #                              replaced with underscores, non-word characters
  #                              are dropped, and finally to_sym() is called.
  #
  # This Hash is intentionally left unfrozen and users should feel free to add
  # values to it that can be accessed by all FasterCSV objects.
  #
  # To add a combo field, the value should be an Array of names. Combo fields
  # can be nested with other combo fields.
  #
  HeaderConverters = {
    :downcase => lambda { |h| h.downcase },
    :symbol   => lambda { |h|
      # "^a-z0-9_" is a negated set: delete everything that is not a word char
      h.downcase.tr(" ", "_").delete("^a-z0-9_").to_sym
    }
  }
  #
  # The settings applied whenever calling code supplies no override:
  #
  # <b><tt>:col_sep</tt></b>::             <tt>","</tt>
  # <b><tt>:row_sep</tt></b>::             <tt>:auto</tt>
  # <b><tt>:quote_char</tt></b>::          <tt>'"'</tt>
  # <b><tt>:converters</tt></b>::          +nil+
  # <b><tt>:unconverted_fields</tt></b>::  +nil+
  # <b><tt>:headers</tt></b>::             +false+
  # <b><tt>:return_headers</tt></b>::      +false+
  # <b><tt>:header_converters</tt></b>::   +nil+
  # <b><tt>:skip_blanks</tt></b>::         +false+
  # <b><tt>:force_quotes</tt></b>::        +false+
  #
  DEFAULT_OPTIONS = {
    :col_sep            => ",",
    :row_sep            => :auto,
    :quote_char         => '"',
    :converters         => nil,
    :unconverted_fields => nil,
    :headers            => false,
    :return_headers     => false,
    :header_converters  => nil,
    :skip_blanks        => false,
    :force_quotes       => false
  }.freeze
  751. #
  752. # This method will build a drop-in replacement for many of the standard CSV
  753. # methods. It allows you to write code like:
  754. #
  755. # begin
  756. # require "faster_csv"
  757. # FasterCSV.build_csv_interface
  758. # rescue LoadError
  759. # require "csv"
  760. # end
  761. # # ... use CSV here ...
  762. #
  763. # This is not a complete interface with completely identical behavior.
  764. # However, it is intended to be close enough that you won't notice the
  765. # difference in most cases. CSV methods supported are:
  766. #
  767. # * foreach()
  768. # * generate_line()
  769. # * open()
  770. # * parse()
  771. # * parse_line()
  772. # * readlines()
  773. #
  774. # Be warned that this interface is slower than vanilla FasterCSV due to the
  775. # extra layer of method calls. Depending on usage, this can slow it down to
  776. # near CSV speeds.
  777. #
  def self.build_csv_interface
    # install a fresh class as ::CSV and give it class methods that translate
    # the positional separator arguments into FasterCSV option Hashes
    Object.const_set(:CSV, Class.new).class_eval do
      def self.foreach(path, rs = :auto, &block)  # :nodoc:
        FasterCSV.foreach(path, :row_sep => rs, &block)
      end

      def self.generate_line(row, fs = ",", rs = "")  # :nodoc:
        FasterCSV.generate_line(row, :col_sep => fs, :row_sep => rs)
      end

      def self.open(path, mode, fs = ",", rs = :auto, &block)  # :nodoc:
        if block and mode.include? "r"
          # in read modes the block is handed each row, not the csv object
          FasterCSV.open(path, mode, :col_sep => fs, :row_sep => rs) do |csv|
            csv.each(&block)
          end
        else
          FasterCSV.open(path, mode, :col_sep => fs, :row_sep => rs, &block)
        end
      end

      def self.parse(str_or_readable, fs = ",", rs = :auto, &block)  # :nodoc:
        FasterCSV.parse(str_or_readable, :col_sep => fs, :row_sep => rs, &block)
      end

      def self.parse_line(src, fs = ",", rs = :auto)  # :nodoc:
        FasterCSV.parse_line(src, :col_sep => fs, :row_sep => rs)
      end

      def self.readlines(path, rs = :auto)  # :nodoc:
        FasterCSV.readlines(path, :row_sep => rs)
      end
    end
  end
  806. #
  807. # This method allows you to serialize an Array of Ruby objects to a String or
  808. # File of CSV data. This is not as powerful as Marshal or YAML, but perhaps
  809. # useful for spreadsheet and database interaction.
  810. #
  811. # Out of the box, this method is intended to work with simple data objects or
  812. # Structs. It will serialize a list of instance variables and/or
  813. # Struct.members().
  814. #
  815. # If you need more complicated serialization, you can control the process
  816. # by adding methods to the class to be serialized.
  817. #
  818. # A class method csv_meta() is responsible for returning the first row of the
  819. # document (as an Array). This row is considered to be a Hash of the form
  820. # key_1,value_1,key_2,value_2,... FasterCSV::load() expects to find a class
  821. # key with a value of the stringified class name and FasterCSV::dump() will
  822. # create this, if you do not define this method. This method is only called
  823. # on the first object of the Array.
  824. #
  825. # The next method you can provide is an instance method called csv_headers().
  826. # This method is expected to return the second line of the document (again as
  827. # an Array), which is to be used to give each column a header. By default,
  828. # FasterCSV::load() will set an instance variable if the field header starts
  829. # with an @ character or call send() passing the header as the method name and
  830. # the field value as an argument. This method is only called on the first
  831. # object of the Array.
  832. #
  833. # Finally, you can provide an instance method called csv_dump(), which will
  834. # be passed the headers. This should return an Array of fields that can be
  835. # serialized for this object. This method is called once for every object in
  836. # the Array.
  837. #
  838. # The +io+ parameter can be used to serialize to a File, and +options+ can be
  839. # anything FasterCSV::new() accepts.
  840. #
  def self.dump(ary_of_objs, io = "", options = Hash.new)
    # the first object decides the meta row and the headers for all of them
    obj_template = ary_of_objs.first

    csv = FasterCSV.new(io, options)

    # write meta information
    begin
      csv << obj_template.class.csv_meta
    rescue NoMethodError
      csv << [:class, obj_template.class]
    end

    # write headers
    begin
      headers = obj_template.csv_headers
    rescue NoMethodError
      headers = obj_template.instance_variables.sort
      if obj_template.class.ancestors.find { |cls| cls.to_s =~ /\AStruct\b/ }
        # Struct members are stored as setter names ("name=")
        headers += obj_template.members.map { |mem| "#{mem}=" }.sort
      end
    end
    csv << headers

    # serialize each object
    ary_of_objs.each do |obj|
      begin
        csv << obj.csv_dump(headers)
      rescue NoMethodError
        fields = headers.map do |var|
          if var[0] == ?@  # instance variable header
            obj.instance_variable_get(var)
          else             # Struct setter header, drop the trailing "="
            obj[var[0..-2]]
          end
        end
        csv << fields
      end
    end

    # Strings get the serialized data back, anything else is closed
    if io.is_a? String
      csv.string
    else
      csv.close
    end
  end
  880. #
  881. # :call-seq:
  882. # filter( options = Hash.new ) { |row| ... }
  883. # filter( input, options = Hash.new ) { |row| ... }
  884. # filter( input, output, options = Hash.new ) { |row| ... }
  885. #
  886. # This method is a convenience for building Unix-like filters for CSV data.
  887. # Each row is yielded to the provided block which can alter it as needed.
  888. # After the block returns, the row is appended to +output+ altered or not.
  889. #
  890. # The +input+ and +output+ arguments can be anything FasterCSV::new() accepts
  891. # (generally String or IO objects). If not given, they default to
  892. # <tt>ARGF</tt> and <tt>$stdout</tt>.
  893. #
  894. # The +options+ parameter is also filtered down to FasterCSV::new() after some
  895. # clever key parsing. Any key beginning with <tt>:in_</tt> or
  896. # <tt>:input_</tt> will have that leading identifier stripped and will only
  897. # be used in the +options+ Hash for the +input+ object. Keys starting with
  898. # <tt>:out_</tt> or <tt>:output_</tt> affect only +output+. All other keys
  899. # are assigned to both objects.
  900. #
  901. # The <tt>:output_row_sep</tt> +option+ defaults to
  902. # <tt>$INPUT_RECORD_SEPARATOR</tt> (<tt>$/</tt>).
  903. #
  def self.filter(*args)
    # parse options for input, output, or both
    in_options, out_options = Hash.new, {:row_sep => $INPUT_RECORD_SEPARATOR}
    if args.last.is_a? Hash
      args.pop.each do |key, value|
        case key.to_s
        when /\Ain(?:put)?_(.+)\Z/   # :in_* and :input_* keys: input only
          in_options[$1.to_sym] = value
        when /\Aout(?:put)?_(.+)\Z/  # :out_* and :output_* keys: output only
          out_options[$1.to_sym] = value
        else                         # everything else applies to both
          in_options[key]  = value
          out_options[key] = value
        end
      end
    end

    # build input and output wrappers
    input  = FasterCSV.new(args.shift || ARGF,    in_options)
    output = FasterCSV.new(args.shift || $stdout, out_options)

    # read, yield, write
    input.each do |row|
      yield row
      output << row
    end
  end
  929. #
  930. # This method is intended as the primary interface for reading CSV files. You
  931. # pass a +path+ and any +options+ you wish to set for the read. Each row of
  932. # file will be passed to the provided +block+ in turn.
  933. #
  934. # The +options+ parameter can be anything FasterCSV::new() understands.
  935. #
  def self.foreach(path, options = Hash.new, &block)
    # open() closes the file for us once every row has been yielded
    open(path, "rb", options) do |csv|
      csv.each(&block)
    end
  end
  941. #
  942. # :call-seq:
  943. # generate( str, options = Hash.new ) { |faster_csv| ... }
  944. # generate( options = Hash.new ) { |faster_csv| ... }
  945. #
  946. # This method wraps a String you provide, or an empty default String, in a
  947. # FasterCSV object which is passed to the provided block. You can use the
  948. # block to append CSV rows to the String and when the block exits, the
  949. # final String will be returned.
  950. #
  951. # Note that a passed String *is* modified by this method. Call dup() before
  952. # passing if you need a new String.
  953. #
  954. # The +options+ parameter can be anything FasterCSV::new() understands.
  955. #
  def self.generate(*args)
    # add a default empty String, if none was given
    if args.first.is_a? String
      # wrap the caller's String and move to its end so rows are appended
      io = StringIO.new(args.shift)
      io.seek(0, IO::SEEK_END)
      args.unshift(io)
    else
      args.unshift("")
    end
    faster_csv = new(*args)  # wrap
    yield faster_csv         # yield for appending
    faster_csv.string        # return final String
  end
  969. #
  970. # This method is a shortcut for converting a single row (Array) into a CSV
  971. # String.
  972. #
  973. # The +options+ parameter can be anything FasterCSV::new() understands.
  974. #
  975. # The <tt>:row_sep</tt> +option+ defaults to <tt>$INPUT_RECORD_SEPARATOR</tt>
  976. # (<tt>$/</tt>) when calling this method.
  977. #
  def self.generate_line(row, options = Hash.new)
    # :row_sep defaults to $/ here, but a caller-supplied value wins
    options = {:row_sep => $INPUT_RECORD_SEPARATOR}.merge(options)
    (new("", options) << row).string
  end
  982. #
  983. # This method will return a FasterCSV instance, just like FasterCSV::new(),
  984. # but the instance will be cached and returned for all future calls to this
  985. # method for the same +data+ object (tested by Object#object_id()) with the
  986. # same +options+.
  987. #
  988. # If a block is given, the instance is passed to the block and the return
  989. # value becomes the return value of the block.
  990. #
  def self.instance(data = $stdout, options = Hash.new)
    # create a _signature_ for this method call, data object and options
    sig = [data.object_id] +
          options.values_at(*DEFAULT_OPTIONS.keys.sort_by { |sym| sym.to_s })

    # fetch or create the instance for this signature
    @@instances ||= Hash.new
    instance    =   (@@instances[sig] ||= new(data, options))

    if block_given?
      yield instance  # run block, if given, returning result
    else
      instance        # or return the instance
    end
  end
  1004. #
  1005. # This method is the reading counterpart to FasterCSV::dump(). See that
  1006. # method for a detailed description of the process.
  1007. #
  1008. # You can customize loading by adding a class method called csv_load() which
  1009. # will be passed a Hash of meta information, an Array of headers, and an Array
  1010. # of fields for the object the method is expected to return.
  1011. #
  1012. # Remember that all fields will be Strings after this load. If you need
  1013. # something else, use +options+ to setup converters or provide a custom
  1014. # csv_load() implementation.
  1015. #
  def self.load(io_or_str, options = Hash.new)
    csv = FasterCSV.new(io_or_str, options)

    # load meta information
    meta = Hash[*csv.shift]
    # resolve a possibly namespaced class name, one constant at a time
    cls  = meta["class"].split("::").inject(Object) do |c, const|
      c.const_get(const)
    end

    # load headers
    headers = csv.shift

    # unserialize each object stored in the file
    results = csv.inject(Array.new) do |all, row|
      begin
        obj = cls.csv_load(meta, headers, row)
      rescue NoMethodError
        obj = cls.allocate
        headers.zip(row) do |name, value|
          if name[0] == ?@
            obj.instance_variable_set(name, value)
          else
            obj.send(name, value)  # "member=" style header, see dump()
          end
        end
      end
      all << obj
    end

    csv.close unless io_or_str.is_a? String

    results
  end
  1044. #
  1045. # :call-seq:
  1046. # open( filename, mode="rb", options = Hash.new ) { |faster_csv| ... }
  1047. # open( filename, mode="rb", options = Hash.new )
  1048. #
  1049. # This method opens an IO object, and wraps that with FasterCSV. This is
  1050. # intended as the primary interface for writing a CSV file.
  1051. #
  1052. # You may pass any +args+ Ruby's open() understands followed by an optional
  1053. # Hash containing any +options+ FasterCSV::new() understands.
  1054. #
  1055. # This method works like Ruby's open() call, in that it will pass a FasterCSV
  1056. # object to a provided block and close it when the block terminates, or it
  1057. # will return the FasterCSV object when no block is provided. (*Note*: This
  1058. # is different from the standard CSV library which passes rows to the block.
  1059. # Use FasterCSV::foreach() for that behavior.)
  1060. #
  1061. # An opened FasterCSV object will delegate to many IO methods, for
  1062. # convenience. You may call:
  1063. #
  1064. # * binmode()
  1065. # * close()
  1066. # * close_read()
  1067. # * close_write()
  1068. # * closed?()
  1069. # * eof()
  1070. # * eof?()
  1071. # * fcntl()
  1072. # * fileno()
  1073. # * flush()
  1074. # * fsync()
  1075. # * ioctl()
  1076. # * isatty()
  1077. # * pid()
  1078. # * pos()
  1079. # * reopen()
  1080. # * seek()
  1081. # * stat()
  1082. # * sync()
  1083. # * sync=()
  1084. # * tell()
  1085. # * to_i()
  1086. # * to_io()
  1087. # * tty?()
  1088. #
  def self.open(*args)
    # find the +options+ Hash
    options = if args.last.is_a? Hash then args.pop else Hash.new end
    # default to a binary open mode
    args << "rb" if args.size == 1
    # wrap a File opened with the remaining +args+
    csv     = new(File.open(*args), options)

    # handle blocks like Ruby's open(), not like the CSV library
    if block_given?
      begin
        yield csv
      ensure
        csv.close  # always release the File, even if the block raises
      end
    else
      csv
    end
  end
  1107. #
  1108. # :call-seq:
  1109. # parse( str, options = Hash.new ) { |row| ... }
  1110. # parse( str, options = Hash.new )
  1111. #
  1112. # This method can be used to easily parse CSV out of a String. You may either
  1113. # provide a +block+ which will be called with each row of the String in turn,
  1114. # or just use the returned Array of Arrays (when no +block+ is given).
  1115. #
  1116. # You pass your +str+ to read from, and an optional +options+ Hash containing
  1117. # anything FasterCSV::new() understands.
  1118. #
  def self.parse(*args, &block)
    csv = new(*args)
    if block.nil?  # slurp contents, if no block is given
      begin
        csv.read
      ensure
        csv.close
      end
    else           # or pass each row to a provided block
      csv.each(&block)
    end
  end
  1131. #
  1132. # This method is a shortcut for converting a single line of a CSV String into
  1133. # an Array. Note that if +line+ contains multiple rows, anything
  1134. # beyond the first row is ignored.
  1135. #
  1136. # The +options+ parameter can be anything FasterCSV::new() understands.
  1137. #
  def self.parse_line(line, options = Hash.new)
    # only the first row is read; any remaining rows are ignored
    new(line, options).shift
  end
  1141. #
  1142. # Use to slurp a CSV file into an Array of Arrays. Pass the +path+ to the
  1143. # file and any +options+ FasterCSV::new() understands.
  1144. #
  def self.read(path, options = Hash.new)
    # open() closes the file after the block has slurped every row
    open(path, "rb", options) { |csv| csv.read }
  end
  1148. # Alias for FasterCSV::read().
  def self.readlines(*args)
    # pure alias: forward everything to FasterCSV::read()
    read(*args)
  end
  1152. #
  1153. # A shortcut for:
  1154. #
  1155. # FasterCSV.read( path, { :headers => true,
  1156. # :converters => :numeric,
  1157. # :header_converters => :symbol }.merge(options) )
  1158. #
  def self.table(path, options = Hash.new)
    # caller-supplied options override these table-friendly defaults
    read( path, { :headers           => true,
                  :converters        => :numeric,
                  :header_converters => :symbol }.merge(options) )
  end
  1164. #
  1165. # This constructor will wrap either a String or IO object passed in +data+ for
  1166. # reading and/or writing. In addition to the FasterCSV instance methods,
  1167. # several IO methods are delegated. (See FasterCSV::open() for a complete
  1168. # list.) If you pass a String for +data+, you can later retrieve it (after
  1169. # writing to it, for example) with FasterCSV.string().
  1170. #
  1171. # Note that a wrapped String will be positioned at the beginning (for
  1172. # reading). If you want it at the end (for writing), use
  1173. # FasterCSV::generate(). If you want any other positioning, pass a preset
  1174. # StringIO object instead.
  1175. #
  1176. # You may set any reading and/or writing preferences in the +options+ Hash.
  1177. # Available options are:
  1178. #
  1179. # <b><tt>:col_sep</tt></b>:: The String placed between each field.
  1180. # <b><tt>:row_sep</tt></b>:: The String appended to the end of each
  1181. # row. This can be set to the special
  1182. # <tt>:auto</tt> setting, which requests
  1183. # that FasterCSV automatically discover
  1184. # this from the data. Auto-discovery
  1185. # reads ahead in the data looking for
  1186. # the next <tt>"\r\n"</tt>,
  1187. # <tt>"\n"</tt>, or <tt>"\r"</tt>
  1188. # sequence. A sequence will be selected
  1189. # even if it occurs in a quoted field,
  1190. # assuming that you would have the same
  1191. # line endings there. If none of those
  1192. # sequences is found, +data+ is
  1193. # <tt>ARGF</tt>, <tt>STDIN</tt>,
  1194. # <tt>STDOUT</tt>, or <tt>STDERR</tt>,
  1195. # or the stream is only available for
  1196. # output, the default
  1197. # <tt>$INPUT_RECORD_SEPARATOR</tt>
  1198. # (<tt>$/</tt>) is used. Obviously,
  1199. # discovery takes a little time. Set
  1200. # manually if speed is important. Also
  1201. # note that IO objects should be opened
  1202. # in binary mode on Windows if this
  1203. # feature will be used as the
  1204. # line-ending translation can cause
  1205. # problems with resetting the document
  1206. # position to where it was before the
  1207. # read ahead.
  1208. # <b><tt>:quote_char</tt></b>:: The character used to quote fields.
  1209. # This has to be a single character
  1210. # String. This is useful for
  1211. # application that incorrectly use
  1212. # <tt>'</tt> as the quote character
  1213. # instead of the correct <tt>"</tt>.
  1214. # FasterCSV will always consider a
  1215. # double sequence this character to be
  1216. # an escaped quote.
  1217. # <b><tt>:encoding</tt></b>:: The encoding to use when parsing the
  1218. # file. Defaults to your <tt>$KCODE</tt>
  1219. # setting. Valid values: <tt>`n’</tt> or
  1220. # <tt>`N’</tt> for none, <tt>`e’</tt> or
  1221. # <tt>`E’</tt> for EUC, <tt>`s’</tt> or
  1222. # <tt>`S’</tt> for SJIS, and
  1223. # <tt>`u’</tt> or <tt>`U’</tt> for UTF-8
  1224. # (see Regexp.new()).
  1225. # <b><tt>:field_size_limit</tt></b>:: This is a maximum size FasterCSV will
  1226. # read ahead looking for the closing
  1227. # quote for a field. (In truth, it
  1228. # reads to the first line ending beyond
  1229. # this size.) If a quote cannot be
  1230. # found within the limit FasterCSV will
  1231. # raise a MalformedCSVError, assuming
  1232. # the data is faulty. You can use this
  1233. # limit to prevent what are effectively
  1234. # DoS attacks on the parser. However,
  1235. # this limit can cause a legitimate
  1236. # parse to fail and thus is set to
  1237. # +nil+, or off, by default.
  1238. # <b><tt>:converters</tt></b>:: An Array of names from the Converters
  1239. # Hash and/or lambdas that handle custom
  1240. # conversion. A single converter
  1241. # doesn't have to be in an Array.
  1242. # <b><tt>:unconverted_fields</tt></b>:: If set to +true+, an
  1243. # unconverted_fields() method will be
  1244. # added to all returned rows (Array or
  1245. # FasterCSV::Row) that will return the
  1246. # fields as they were before conversion.
  1247. # Note that <tt>:headers</tt> supplied
  1248. # by Array or String were not fields of
  1249. # the document and thus will have an
  1250. # empty Array attached.
  1251. # <b><tt>:headers</tt></b>:: If set to <tt>:first_row</tt> or
  1252. # +true+, the initial row of the CSV
  1253. # file will be treated as a row of
  1254. # headers. If set to an Array, the
  1255. # contents will be used as the headers.
  1256. # If set to a String, the String is run
  1257. # through a call of
  1258. # FasterCSV::parse_line() with the same
  1259. # <tt>:col_sep</tt>, <tt>:row_sep</tt>,
  1260. # and <tt>:quote_char</tt> as this
  1261. # instance to produce an Array of
  1262. # headers. This setting causes
  1263. # FasterCSV.shift() to return rows as
  1264. # FasterCSV::Row objects instead of
  1265. # Arrays and FasterCSV.read() to return
  1266. # FasterCSV::Table objects instead of
  1267. # an Array of Arrays.
  1268. # <b><tt>:return_headers</tt></b>:: When +false+, header rows are silently
  1269. # swallowed. If set to +true+, header
  1270. # rows are returned in a FasterCSV::Row
  1271. # object with identical headers and
  1272. # fields (save that the fields do not go
  1273. # through the converters).
  1274. # <b><tt>:write_headers</tt></b>:: When +true+ and <tt>:headers</tt> is
  1275. # set, a header row will be added to the
  1276. # output.
  1277. # <b><tt>:header_converters</tt></b>:: Identical in functionality to
  1278. # <tt>:converters</tt> save that the
  1279. # conversions are only made to header
  1280. # rows.
  1281. # <b><tt>:skip_blanks</tt></b>:: When set to a +true+ value, FasterCSV
  1282. # will skip over any rows with no
  1283. # content.
  1284. # <b><tt>:force_quotes</tt></b>:: When set to a +true+ value, FasterCSV
  1285. # will quote all CSV fields it creates.
  1286. #
  1287. # See FasterCSV::DEFAULT_OPTIONS for the default settings.
  1288. #
  1289. # Options cannot be overridden in the instance methods for performance reasons,
  1290. # so be sure to set what you want here.
  1291. #
  def initialize(data, options = Hash.new)
    # build the options for this read/write
    options = DEFAULT_OPTIONS.merge(options)

    # create the IO object we will read from
    @io = if data.is_a? String then StringIO.new(data) else data end

    # each init_*() call delete()s the options it consumes...
    init_separators(options)
    init_parsers(options)
    init_converters(options)
    init_headers(options)

    # ...so anything left over was not a recognized option
    unless options.empty?
      raise ArgumentError, "Unknown options: #{options.keys.join(', ')}."
    end

    # track our own lineno since IO gets confused about line-ends in CSV fields
    @lineno = 0
  end
  1307. #
  1308. # The line number of the last row read from this file. Fields with nested
  1309. # line-end characters will not affect this count.
  1310. #
  attr_reader :lineno

  ### IO and StringIO Delegation ###

  extend Forwardable
  def_delegators :@io, :binmode, :close, :close_read, :close_write, :closed?,
                       :eof, :eof?, :fcntl, :fileno, :flush, :fsync, :ioctl,
                       :isatty, :pid, :pos, :reopen, :seek, :stat, :string,
                       :sync, :sync=, :tell, :to_i, :to_io, :tty?
  1318. # Rewinds the underlying IO object and resets FasterCSV's lineno() counter.
  def rewind
    # forget parsed headers so a header row is recognized again after rewinding
    @headers = nil
    @lineno  = 0

    @io.rewind
  end
  1324. ### End Delegation ###
  1325. #
  1326. # The primary write method for wrapped Strings and IOs, +row+ (an Array or
  1327. # FasterCSV::Row) is converted to CSV and appended to the data source. When a
  1328. # FasterCSV::Row is passed, only the row's fields() are appended to the
  1329. # output.
  1330. #
  1331. # The data source must be open for writing.
  1332. #
  def <<(row)
    # make sure headers have been assigned
    if header_row? and [Array, String].include? @use_headers.class
      parse_headers  # won't read data for Array or String
      self << @headers if @write_headers
    end

    # Handle FasterCSV::Row objects and Hashes
    row = case row
          when self.class::Row then row.fields
          when Hash            then @headers.map { |header| row[header] }
          else                      row
          end

    # the first row written doubles as the header row, when headers are in use
    @headers =  row if header_row?
    @lineno  += 1

    @io << row.map(&@quote).join(@col_sep) + @row_sep  # quote and separate

    self  # for chaining
  end
  # add_row() and puts() are alternate names for <<()
  alias_method :add_row, :<<
  alias_method :puts,    :<<
  1352. #
  1353. # :call-seq:
  1354. # convert( name )
  1355. # convert { |field| ... }
  1356. # convert { |field, field_info| ... }
  1357. #
  1358. # You can use this method to install a FasterCSV::Converters built-in, or
  1359. # provide a block that handles a custom conversion.
  1360. #
  1361. # If you provide a block that takes one argument, it will be passed the field
  1362. # and is expected to return the converted value or the field itself. If your
  1363. # block takes two arguments, it will also be passed a FieldInfo Struct,
  1364. # containing details about the field. Again, the block should return a
  1365. # converted field or the field itself.
  1366. #
  def convert(name = nil, &converter)
    # field converters live in @converters; names resolve via Converters
    add_converter(:converters, self.class::Converters, name, &converter)
  end
  1370. #
  1371. # :call-seq:
  1372. # header_convert( name )
  1373. # header_convert { |field| ... }
  1374. # header_convert { |field, field_info| ... }
  1375. #
  1376. # Identical to FasterCSV.convert(), but for header rows.
  1377. #
  1378. # Note that this method must be called before header rows are read to have any
  1379. # effect.
  1380. #
  def header_convert(name = nil, &converter)
    # header converters live in @header_converters; names resolve via
    # HeaderConverters
    add_converter( :header_converters,
                   self.class::HeaderConverters,
                   name,
                   &converter )
  end
  include Enumerable
  1388. #
  1389. # Yields each row of the data source in turn.
  1390. #
  1391. # Support for Enumerable.
  1392. #
  1393. # The data source must be open for reading.
  1394. #
  def each
    # shift() returns +nil+ once the data source is exhausted
    while row = shift
      yield row
    end
  end
  1400. #
  1401. # Slurps the remaining rows and returns an Array of Arrays.
  1402. #
  1403. # The data source must be open for reading.
  1404. #
  def read
    rows = to_a
    # with headers in use the rows are wrapped in a Table
    if @use_headers
      Table.new(rows)
    else
      rows
    end
  end
  alias_method :readlines, :read
  1414. # Returns +true+ if the next row read will be a header row.
  def header_row?
    # headers are wanted but none have been parsed or written yet
    @use_headers and @headers.nil?
  end
  1418. #
  1419. # The primary read method for wrapped Strings and IOs, a single row is pulled
  1420. # from the data source, parsed and returned as an Array of fields (if header
  1421. # rows are not used) or a FasterCSV::Row (when header rows are used).
  1422. #
  1423. # The data source must be open for reading.
  1424. #
  def shift
    #########################################################################
    ### This method is purposefully kept a bit long as simple conditional ###
    ### checks are faster than numerous (expensive) method calls.         ###
    #########################################################################

    # handle headers not based on document content
    if header_row? and @return_headers and
       [Array, String].include? @use_headers.class
      if @unconverted_fields
        return add_unconverted_fields(parse_headers, Array.new)
      else
        return parse_headers
      end
    end

    # begin with a blank line, so we can always add to it
    line = String.new

    #
    # it can take multiple calls to <tt>@io.gets()</tt> to get a full line,
    # because of \r and/or \n characters embedded in quoted fields
    #
    loop do
      # add another read to the line
      begin
        line += @io.gets(@row_sep)
      rescue  # gets() returned nil at end of data, so there is no row
        return nil
      end
      # copy the line so we can chop it up in parsing
      parse = line.dup
      parse.sub!(@parsers[:line_end], "")

      #
      # I believe a blank line should be an <tt>Array.new</tt>, not
      # CSV's <tt>[nil]</tt>
      #
      if parse.empty?
        @lineno += 1
        if @skip_blanks
          line = ""
          next
        elsif @unconverted_fields
          return add_unconverted_fields(Array.new, Array.new)
        elsif @use_headers
          return FasterCSV::Row.new(Array.new, Array.new)
        else
          return Array.new
        end
      end

      # parse the fields with a mix of String#split and regular expressions
      csv           = Array.new
      current_field = String.new
      field_quotes  = 0
      parse.split(@col_sep, -1).each do |match|
        if current_field.empty? && match.count(@quote_and_newlines).zero?
          csv << (match.empty? ? nil : match)
        elsif (current_field.empty? ? match[0] : current_field[0]) == @quote_char[0]
          current_field << match
          field_quotes += match.count(@quote_char)
          if field_quotes % 2 == 0  # balanced quotes, the field is complete
            in_quotes = current_field[@parsers[:quoted_field], 1]
            raise MalformedCSVError unless in_quotes
            current_field = in_quotes
            current_field.gsub!(@quote_char * 2, @quote_char) # unescape contents
            csv           << current_field
            current_field =  String.new
            field_quotes  =  0
          else # the split() separator was inside the quoted field, put it back
            current_field << @col_sep
          end
        elsif match.count("\r\n").zero?
          raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}."
        else
          raise MalformedCSVError, "Unquoted fields do not allow " +
                                   "\\r or \\n (line #{lineno + 1})."
        end
      end

      # if the quotes are balanced, we found all the fields on the line...
      if field_quotes % 2 == 0
        @lineno += 1

        # save fields unconverted fields, if needed...
        unconverted = csv.dup if @unconverted_fields

        # convert fields, if needed...
        csv = convert_fields(csv) unless @use_headers or @converters.empty?
        # parse out header rows and handle FasterCSV::Row conversions...
        csv = parse_headers(csv)  if     @use_headers

        # inject unconverted fields and accessor, if requested...
        if @unconverted_fields and not csv.respond_to? :unconverted_fields
          add_unconverted_fields(csv, unconverted)
        end

        # return the results
        break csv
      end
      # if we're not empty?() but at eof?(), a quoted field wasn't closed...
      if @io.eof?
        raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}."
      elsif @field_size_limit and current_field.size >= @field_size_limit
        raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}."
      end
      # otherwise, we need to loop and pull some more data to complete the row
    end
  end
  # gets() and readline() are alternate names for shift()
  alias_method :gets,     :shift
  alias_method :readline, :shift
  1527. # Returns a simplified description of the key FasterCSV attributes.
  1528. def inspect
  1529. str = "<##{self.class} io_type:"
  1530. # show type of wrapped IO
  1531. if @io == $stdout then str << "$stdout"
  1532. elsif @io == $stdin then str << "$stdin"
  1533. elsif @io == $stderr then str << "$stderr"
  1534. else str << @io.class.to_s
  1535. end
  1536. # show IO.path(), if available
  1537. if @io.respond_to?(:path) and (p = @io.path)
  1538. str << " io_path:#{p.inspect}"
  1539. end
  1540. # show other attributes
  1541. %w[ lineno col_sep row_sep
  1542. quote_char skip_blanks encoding ].each do |attr_name|
  1543. if a = instance_variable_get("@#{attr_name}")
  1544. str << " #{attr_name}:#{a.inspect}"
  1545. end
  1546. end
  1547. if @use_headers
  1548. str << " headers:#{(@headers || true).inspect}"
  1549. end
  1550. str << ">"
  1551. end
  private
  1553. #
  1554. # Stores the indicated separators for later use.
  1555. #
  1556. # If auto-discovery was requested for <tt>@row_sep</tt>, this method will read
  1557. # ahead in the <tt>@io</tt> and try to find one. +ARGF+, +STDIN+, +STDOUT+,
  1558. # +STDERR+ and any stream open for output only with a default
  1559. # <tt>@row_sep</tt> of <tt>$INPUT_RECORD_SEPARATOR</tt> (<tt>$/</tt>).
  1560. #
  1561. # This method also establishes the quoting rules used for CSV output.
  1562. #
  def init_separators(options)
    # store the selected separators
    @col_sep            = options.delete(:col_sep)
    @row_sep            = options.delete(:row_sep)
    @quote_char         = options.delete(:quote_char)
    @quote_and_newlines = "#{@quote_char}\r\n"

    if @quote_char.length != 1
      raise ArgumentError, ":quote_char has to be a single character String"
    end

    # automatically discover row separator when requested
    if @row_sep == :auto
      if [ARGF, STDIN, STDOUT, STDERR].include?(@io) or
         (defined?(Zlib) and @io.class == Zlib::GzipWriter)
        @row_sep = $INPUT_RECORD_SEPARATOR
      else
        begin
          saved_pos = @io.pos  # remember where we were
          while @row_sep == :auto
            #
            # if we run out of data, it's probably a single line
            # (use a sensible default)
            #
            if @io.eof?
              @row_sep = $INPUT_RECORD_SEPARATOR
              break
            end

            # read ahead a bit
            sample =  @io.read(1024)
            # don't split a "\r\n" pair across two reads
            sample += @io.read(1) if sample[-1..-1] == "\r" and not @io.eof?

            # try to find a standard separator
            if sample =~ /\r\n?|\n/
              @row_sep = $&
              break
            end
          end
          # tricky seek() clone to work around GzipReader's lack of seek()
          @io.rewind
          # reset back to the remembered position
          while saved_pos > 1024  # avoid loading a lot of data into memory
            @io.read(1024)
            saved_pos -= 1024
          end
          @io.read(saved_pos) if saved_pos.nonzero?
        rescue IOError  # stream not opened for reading
          @row_sep = $INPUT_RECORD_SEPARATOR
        end
      end
    end

    # establish quoting rules
    do_quote = lambda do |field|
      @quote_char                                      +
      String(field).gsub(@quote_char, @quote_char * 2) +
      @quote_char
    end
    @quote = if options.delete(:force_quotes)
      do_quote
    else
      lambda do |field|
        if field.nil?  # represent +nil+ fields as empty unquoted fields
          ""
        else
          field = String(field)  # Stringify fields
          # represent empty fields as empty quoted fields
          if field.empty? or
             field.count("\r\n#{@col_sep}#{@quote_char}").nonzero?
            do_quote.call(field)
          else
            field  # unquoted field
          end
        end
      end
    end
  end
  1636. # Pre-compiles parsers and stores them by name for access during reads.
  def init_parsers(options)
    # store the parser behaviors
    @skip_blanks      = options.delete(:skip_blanks)
    @encoding         = options.delete(:encoding)  # nil will use $KCODE
    @field_size_limit = options.delete(:field_size_limit)

    # prebuild Regexps for faster parsing
    esc_col_sep = Regexp.escape(@col_sep)
    esc_row_sep = Regexp.escape(@row_sep)
    esc_quote   = Regexp.escape(@quote_char)
    @parsers = {
      :any_field    => Regexp.new( "[^#{esc_col_sep}]+",
                                   Regexp::MULTILINE,
                                   @encoding ),
      :quoted_field => Regexp.new( "^#{esc_quote}(.*)#{esc_quote}$",
                                   Regexp::MULTILINE,
                                   @encoding ),
      # safer than chomp!()
      :line_end     => Regexp.new("#{esc_row_sep}\\z", nil, @encoding)
    }
  end
  1657. #
  1658. # Loads any converters requested during construction.
  1659. #
  1660. # If +field_name+ is set <tt>:converters</tt> (the default) field converters
  1661. # are set. When +field_name+ is <tt>:header_converters</tt> header converters
  1662. # are added instead.
  1663. #
  1664. # The <tt>:unconverted_fields</tt> option is also activated for
  1665. # <tt>:converters</tt> calls, if requested.
  1666. #
  def init_converters(options, field_name = :converters)
    if field_name == :converters
      @unconverted_fields = options.delete(:unconverted_fields)
    end

    instance_variable_set("@#{field_name}", Array.new)

    # find the correct method to add the converters
    # (:converters -> convert(), :header_converters -> header_convert())
    convert = method(field_name.to_s.sub(/ers\Z/, ""))

    # load converters
    unless options[field_name].nil?
      # allow a single converter not wrapped in an Array
      unless options[field_name].is_a? Array
        options[field_name] = [options[field_name]]
      end
      # load each converter...
      options[field_name].each do |converter|
        if converter.is_a? Proc  # custom code block
          convert.call(&converter)
        else                     # by name
          convert.call(converter)
        end
      end
    end

    options.delete(field_name)
  end
  1691. # Stores header row settings and loads header converters, if needed.
  def init_headers(options)
    @use_headers    = options.delete(:headers)
    @return_headers = options.delete(:return_headers)
    @write_headers  = options.delete(:write_headers)

    # headers must be delayed until shift(), in case they need a row of content
    @headers = nil

    init_converters(options, :header_converters)
  end
  1700. #
  1701. # The actual work method for adding converters, used by both
  1702. # FasterCSV.convert() and FasterCSV.header_convert().
  1703. #
  1704. # This method requires the +var_name+ of the instance variable to place the
  1705. # converters in, the +const+ Hash to lookup named converters in, and the
  1706. # normal parameters of the FasterCSV.convert() and FasterCSV.header_convert()
  1707. # methods.
  1708. #
  def add_converter(var_name, const, name = nil, &converter)
    if name.nil?  # custom converter
      instance_variable_get("@#{var_name}") << converter
    else          # named converter
      combo = const[name]
      case combo
      when Array  # combo converter: expand each member, recursively
        combo.each do |converter_name|
          add_converter(var_name, const, converter_name)
        end
      else        # individual named converter
        instance_variable_get("@#{var_name}") << combo
      end
    end
  end
  #
  # Processes +fields+ with <tt>@converters</tt>, or <tt>@header_converters</tt>
  # if +headers+ is passed as +true+, returning the converted field set.  Any
  # converter that changes the field into something other than a String halts
  # the pipeline of conversion for that field.  This is primarily an efficiency
  # shortcut.
  #
  # Converters whose arity is 1 receive only the field.  All other converters
  # also receive a FieldInfo built from the field's index, the current
  # lineno(), and the matching header (+nil+ unless headers are in use and a
  # field row is being converted).
  #
  def convert_fields(fields, headers = false)
    # see if we are converting headers or fields
    converters = headers ? @header_converters : @converters

    fields.enum_for(:each_with_index).map do |field, index|  # map_with_index
      converters.each do |converter|
        field = if converter.arity == 1  # straight field converter
          converter[field]
        else                             # FieldInfo converter
          header = @use_headers && !headers ? @headers[index] : nil
          converter[field, FieldInfo.new(index, lineno, header)]
        end
        break unless field.is_a? String  # short-circuit pipeline for speed
      end
      field  # return final state of each field, converted or original
    end
  end
  #
  # This method is used to turn a finished +row+ into a FasterCSV::Row.  Header
  # rows are also dealt with here, either by returning a FasterCSV::Row with
  # identical headers and fields (save that the fields do not go through the
  # converters) or by reading past them to return a field row.  Headers are also
  # saved in <tt>@headers</tt> for use in future rows.
  #
  # When +nil+, +row+ is assumed to be a header row not based on an actual row
  # of the stream.
  #
  def parse_headers(row = nil)
    if @headers.nil?                # header row
      @headers = case @use_headers  # save headers
                 # Array of headers
                 when Array  then @use_headers
                 # CSV header String
                 when String
                   self.class.parse_line( @use_headers,
                                          :col_sep    => @col_sep,
                                          :row_sep    => @row_sep,
                                          :quote_char => @quote_char )
                 # first row is headers
                 else             row
                 end

      # prepare converted and unconverted copies
      row      = @headers if row.nil?
      @headers = convert_fields(@headers, true)

      if @return_headers                                     # return headers
        return FasterCSV::Row.new(@headers, row, true)
      elsif not [Array, String].include? @use_headers.class  # skip to field row
        # headers came from the stream itself, so this row is consumed;
        # hand back the next row instead
        return shift
      end
    end

    FasterCSV::Row.new(@headers, convert_fields(row))  # field row
  end
  #
  # This method injects an instance variable <tt>unconverted_fields</tt> into
  # +row+ and an accessor method for it called unconverted_fields().  The
  # variable is set to the contents of +fields+.
  #
  # The accessor is defined on the singleton class of +row+, so other objects
  # of the same class are not affected.  Returns +row+.
  #
  def add_unconverted_fields(row, fields)
    class << row
      attr_reader :unconverted_fields
    end
    row.instance_variable_set("@unconverted_fields", fields)
    row
  end
  1794. end
  # Another name for FasterCSV.
  FCSV = FasterCSV
  # Another name for FasterCSV::instance().  All arguments and any block are
  # passed through unchanged.
  def FasterCSV(*args, &block)
    FasterCSV.instance(*args, &block)
  end
  # Another name for FCSV::instance().  All arguments and any block are passed
  # through unchanged.
  def FCSV(*args, &block)
    FCSV.instance(*args, &block)
  end
  class Array
    # Equivalent to <tt>FasterCSV::generate_line(self, options)</tt>.
    #
    # +options+ defaults to an empty Hash and is passed through untouched.
    def to_csv(options = Hash.new)
      FasterCSV.generate_line(self, options)
    end
  end
  class String
    # Equivalent to <tt>FasterCSV::parse_line(self, options)</tt>.
    #
    # +options+ defaults to an empty Hash and is passed through untouched.
    def parse_csv(options = Hash.new)
      FasterCSV.parse_line(self, options)
    end
  end