parsers.py
  1. """
  2. Module contains tools for processing files into DataFrames or other objects
  3. """
  4. from collections import abc, defaultdict
  5. import csv
  6. import datetime
  7. from io import StringIO, TextIOWrapper
  8. import re
  9. import sys
  10. from textwrap import fill
  11. from typing import Any, Dict, Set
  12. import warnings
  13. import numpy as np
  14. import pandas._libs.lib as lib
  15. import pandas._libs.ops as libops
  16. import pandas._libs.parsers as parsers
  17. from pandas._libs.parsers import STR_NA_VALUES
  18. from pandas._libs.tslibs import parsing
  19. from pandas._typing import FilePathOrBuffer
  20. from pandas.errors import (
  21. AbstractMethodError,
  22. EmptyDataError,
  23. ParserError,
  24. ParserWarning,
  25. )
  26. from pandas.util._decorators import Appender
  27. from pandas.core.dtypes.cast import astype_nansafe
  28. from pandas.core.dtypes.common import (
  29. ensure_object,
  30. ensure_str,
  31. is_bool_dtype,
  32. is_categorical_dtype,
  33. is_dtype_equal,
  34. is_extension_array_dtype,
  35. is_file_like,
  36. is_float,
  37. is_integer,
  38. is_integer_dtype,
  39. is_list_like,
  40. is_object_dtype,
  41. is_scalar,
  42. is_string_dtype,
  43. pandas_dtype,
  44. )
  45. from pandas.core.dtypes.dtypes import CategoricalDtype
  46. from pandas.core.dtypes.missing import isna
  47. from pandas.core import algorithms
  48. from pandas.core.arrays import Categorical
  49. from pandas.core.frame import DataFrame
  50. from pandas.core.indexes.api import (
  51. Index,
  52. MultiIndex,
  53. RangeIndex,
  54. ensure_index_from_sequences,
  55. )
  56. from pandas.core.series import Series
  57. from pandas.core.tools import datetimes as tools
  58. from pandas.io.common import (
  59. get_filepath_or_buffer,
  60. get_handle,
  61. infer_compression,
  62. validate_header_arg,
  63. )
  64. from pandas.io.date_converters import generic_parser
  65. # BOM character (byte order mark)
  66. # This exists at the beginning of a file to indicate endianness
  67. # of a file (stream). Unfortunately, this marker screws up parsing,
  68. # so we need to remove it if we see it.
  69. _BOM = "\ufeff"

_doc_read_csv_and_table = (
    r"""
{summary}

Also supports optionally iterating or breaking of the file
into chunks.

Additional help can be found in the online docs for
`IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

Parameters
----------
filepath_or_buffer : str, path object or file-like object
    Any valid string path is acceptable. The string could be a URL. Valid
    URL schemes include http, ftp, s3, and file. For file URLs, a host is
    expected. A local file could be: file://localhost/path/to/table.csv.

    If you want to pass in a path object, pandas accepts any ``os.PathLike``.

    By file-like object, we refer to objects with a ``read()`` method, such as
    a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
sep : str, default {_default_sep}
    Delimiter to use. If sep is None, the C engine cannot automatically detect
    the separator, but the Python parsing engine can, meaning the latter will
    be used and the separator automatically detected by Python's builtin
    sniffer tool, ``csv.Sniffer``. In addition, separators longer than 1
    character and different from ``'\s+'`` will be interpreted as regular
    expressions and will also force the use of the Python parsing engine.
    Note that regex delimiters are prone to ignoring quoted data. Regex
    example: ``'\r\t'``.
delimiter : str, default ``None``
    Alias for sep.
header : int, list of int, default 'infer'
    Row number(s) to use as the column names, and the start of the
    data. Default behavior is to infer the column names: if no names
    are passed the behavior is identical to ``header=0`` and column
    names are inferred from the first line of the file, if column
    names are passed explicitly then the behavior is identical to
    ``header=None``. Explicitly pass ``header=0`` to be able to
    replace existing names. The header can be a list of integers that
    specify row locations for a multi-index on the columns
    e.g. [0,1,3]. Intervening rows that are not specified will be
    skipped (e.g. 2 in this example is skipped). Note that this
    parameter ignores commented lines and empty lines if
    ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
    data rather than the first line of the file.
names : array-like, optional
    List of column names to use. If the file contains a header row,
    then you should explicitly pass ``header=0`` to override the column names.
    Duplicates in this list are not allowed.
index_col : int, str, sequence of int / str, or False, default ``None``
    Column(s) to use as the row labels of the ``DataFrame``, either given as
    string name or column index. If a sequence of int / str is given, a
    MultiIndex is used.

    Note: ``index_col=False`` can be used to force pandas to *not* use the
    first column as the index, e.g. when you have a malformed file with
    delimiters at the end of each line.
usecols : list-like or callable, optional
    Return a subset of the columns. If list-like, all elements must either
    be positional (i.e. integer indices into the document columns) or strings
    that correspond to column names provided either by the user in `names` or
    inferred from the document header row(s). For example, a valid list-like
    `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
    Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
    To instantiate a DataFrame from ``data`` with element order preserved use
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
    in ``['foo', 'bar']`` order or
    ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
    for ``['bar', 'foo']`` order.

    If callable, the callable function will be evaluated against the column
    names, returning names where the callable function evaluates to True. An
    example of a valid callable argument would be ``lambda x: x.upper() in
    ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
    parsing time and lower memory usage.
squeeze : bool, default False
    If the parsed data only contains one column then return a Series.
prefix : str, optional
    Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
    Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
    'X'...'X'. Passing in False will cause data to be overwritten if there
    are duplicate names in the columns.
dtype : Type name or dict of column -> type, optional
    Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
    'c': 'Int64'}}
    Use `str` or `object` together with suitable `na_values` settings
    to preserve and not interpret dtype.
    If converters are specified, they will be applied INSTEAD
    of dtype conversion.
engine : {{'c', 'python'}}, optional
    Parser engine to use. The C engine is faster while the python engine is
    currently more feature-complete.
converters : dict, optional
    Dict of functions for converting values in certain columns. Keys can
    either be integers or column labels.
true_values : list, optional
    Values to consider as True.
false_values : list, optional
    Values to consider as False.
skipinitialspace : bool, default False
    Skip spaces after delimiter.
skiprows : list-like, int or callable, optional
    Line numbers to skip (0-indexed) or number of lines to skip (int)
    at the start of the file.

    If callable, the callable function will be evaluated against the row
    indices, returning True if the row should be skipped and False otherwise.
    An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
skipfooter : int, default 0
    Number of lines at bottom of file to skip (unsupported with engine='c').
nrows : int, optional
    Number of rows of file to read. Useful for reading pieces of large files.
na_values : scalar, str, list-like, or dict, optional
    Additional strings to recognize as NA/NaN. If dict passed, specific
    per-column NA values. By default the following values are interpreted as
    NaN: '"""
    + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent="    ")
    + """'.
keep_default_na : bool, default True
    Whether or not to include the default NaN values when parsing the data.
    Depending on whether `na_values` is passed in, the behavior is as follows:

    * If `keep_default_na` is True, and `na_values` are specified, `na_values`
      is appended to the default NaN values used for parsing.
    * If `keep_default_na` is True, and `na_values` are not specified, only
      the default NaN values are used for parsing.
    * If `keep_default_na` is False, and `na_values` are specified, only
      the NaN values specified in `na_values` are used for parsing.
    * If `keep_default_na` is False, and `na_values` are not specified, no
      strings will be parsed as NaN.

    Note that if `na_filter` is passed in as False, the `keep_default_na` and
    `na_values` parameters will be ignored.
na_filter : bool, default True
    Detect missing value markers (empty strings and the value of na_values). In
    data without any NAs, passing na_filter=False can improve the performance
    of reading a large file.
verbose : bool, default False
    Indicate number of NA values placed in non-numeric columns.
skip_blank_lines : bool, default True
    If True, skip over blank lines rather than interpreting as NaN values.
parse_dates : bool or list of int or names or list of lists or dict, \
default False
    The behavior is as follows:

    * boolean. If True -> try parsing the index.
    * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
      each as a separate date column.
    * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
      a single date column.
    * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
      result 'foo'

    If a column or index cannot be represented as an array of datetimes,
    say because of an unparseable value or a mixture of timezones, the column
    or index will be returned unaltered as an object data type. For
    non-standard datetime parsing, use ``pd.to_datetime`` after
    ``pd.read_csv``. To parse an index or column with a mixture of timezones,
    specify ``date_parser`` to be a partially-applied
    :func:`pandas.to_datetime` with ``utc=True``. See
    :ref:`io.csv.mixed_timezones` for more.

    Note: A fast-path exists for iso8601-formatted dates.
infer_datetime_format : bool, default False
    If True and `parse_dates` is enabled, pandas will attempt to infer the
    format of the datetime strings in the columns, and if it can be inferred,
    switch to a faster method of parsing them. In some cases this can increase
    the parsing speed by 5-10x.
keep_date_col : bool, default False
    If True and `parse_dates` specifies combining multiple columns then
    keep the original columns.
date_parser : function, optional
    Function to use for converting a sequence of string columns to an array of
    datetime instances. The default uses ``dateutil.parser.parser`` to do the
    conversion. Pandas will try to call `date_parser` in three different ways,
    advancing to the next if an exception occurs: 1) Pass one or more arrays
    (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
    string values from the columns defined by `parse_dates` into a single array
    and pass that; and 3) call `date_parser` once for each row using one or
    more strings (corresponding to the columns defined by `parse_dates`) as
    arguments.
dayfirst : bool, default False
    DD/MM format dates, international and European format.
cache_dates : bool, default True
    If True, use a cache of unique, converted dates to apply the datetime
    conversion. May produce significant speed-up when parsing duplicate
    date strings, especially ones with timezone offsets.

    .. versionadded:: 0.25.0
iterator : bool, default False
    Return TextFileReader object for iteration or getting chunks with
    ``get_chunk()``.
chunksize : int, optional
    Return TextFileReader object for iteration.
    See the `IO Tools docs
    <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
    for more information on ``iterator`` and ``chunksize``.
compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer'
    For on-the-fly decompression of on-disk data. If 'infer' and
    `filepath_or_buffer` is path-like, then detect compression from the
    following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
    decompression). If using 'zip', the ZIP file must contain only one data
    file to be read in. Set to None for no decompression.
thousands : str, optional
    Thousands separator.
decimal : str, default '.'
    Character to recognize as decimal point (e.g. use ',' for European data).
lineterminator : str (length 1), optional
    Character to break file into lines. Only valid with C parser.
quotechar : str (length 1), optional
    The character used to denote the start and end of a quoted item. Quoted
    items can include the delimiter and it will be ignored.
quoting : int or csv.QUOTE_* instance, default 0
    Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
    QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
doublequote : bool, default ``True``
    When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
    whether or not to interpret two consecutive quotechar elements INSIDE a
    field as a single ``quotechar`` element.
escapechar : str (length 1), optional
    One-character string used to escape other characters.
comment : str, optional
    Indicates remainder of line should not be parsed. If found at the beginning
    of a line, the line will be ignored altogether. This parameter must be a
    single character. Like empty lines (as long as ``skip_blank_lines=True``),
    fully commented lines are ignored by the parameter `header` but not by
    `skiprows`. For example, if ``comment='#'``, parsing
    ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
    treated as the header.
encoding : str, optional
    Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
    standard encodings
    <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
dialect : str or csv.Dialect, optional
    If provided, this parameter will override values (default or not) for the
    following parameters: `delimiter`, `doublequote`, `escapechar`,
    `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
    override values, a ParserWarning will be issued. See csv.Dialect
    documentation for more details.
error_bad_lines : bool, default True
    Lines with too many fields (e.g. a csv line with too many commas) will by
    default cause an exception to be raised, and no DataFrame will be returned.
    If False, then these "bad lines" will be dropped from the DataFrame that is
    returned.
warn_bad_lines : bool, default True
    If error_bad_lines is False, and warn_bad_lines is True, a warning for each
    "bad line" will be output.
delim_whitespace : bool, default False
    Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
    used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
    is set to True, nothing should be passed in for the ``delimiter``
    parameter.
low_memory : bool, default True
    Internally process the file in chunks, resulting in lower memory use
    while parsing, but possibly mixed type inference. To ensure no mixed
    types either set False, or specify the type with the `dtype` parameter.
    Note that the entire file is read into a single DataFrame regardless;
    use the `chunksize` or `iterator` parameter to return the data in chunks.
    (Only valid with C parser).
memory_map : bool, default False
    If a filepath is provided for `filepath_or_buffer`, map the file object
    directly onto memory and access the data directly from there. Using this
    option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
    Specifies which converter the C engine should use for floating-point
    values. The options are `None` for the ordinary converter,
    `high` for the high-precision converter, and `round_trip` for the
    round-trip converter.

Returns
-------
DataFrame or TextParser
    A comma-separated values (csv) file is returned as two-dimensional
    data structure with labeled axes.

See Also
--------
to_csv : Write DataFrame to a comma-separated values (csv) file.
read_csv : Read a comma-separated values (csv) file into DataFrame.
read_fwf : Read a table of fixed-width formatted lines into DataFrame.

Examples
--------
>>> pd.{func_name}('data.csv')  # doctest: +SKIP
"""
)

def _validate_integer(name, val, min_val=0):
    """
    Checks whether the 'name' parameter for parsing is either
    an integer OR float that can SAFELY be cast to an integer
    without losing accuracy. Raises a ValueError if that is
    not the case.

    Parameters
    ----------
    name : string
        Parameter name (used for error reporting)
    val : int or float
        The value to check
    min_val : int
        Minimum allowed value (val < min_val will result in a ValueError)
    """
    msg = f"'{name:s}' must be an integer >={min_val:d}"

    if val is not None:
        if is_float(val):
            if int(val) != val:
                raise ValueError(msg)
            val = int(val)
            # Enforce min_val on the float path as well, so that e.g.
            # an integral float below min_val is rejected as documented.
            if val < min_val:
                raise ValueError(msg)
        elif not (is_integer(val) and val >= min_val):
            raise ValueError(msg)

    return val
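
# Illustrative examples (not part of the original module): _validate_integer
# accepts integral floats, passes None through, and rejects fractional or
# too-small values.
#
#     >>> _validate_integer("nrows", 5.0)
#     5
#     >>> _validate_integer("chunksize", None, 1)  # None passes through
#     >>> _validate_integer("nrows", 5.5)
#     Traceback (most recent call last):
#     ...
#     ValueError: 'nrows' must be an integer >=0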

def _validate_names(names):
    """
    Raise ValueError if the `names` parameter contains duplicates.

    Parameters
    ----------
    names : array-like or None
        An array containing a list of the names used for the output DataFrame.

    Raises
    ------
    ValueError
        If names are not unique.
    """
    if names is not None:
        if len(names) != len(set(names)):
            raise ValueError("Duplicate names are not allowed.")

def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
    """Generic reader of line files."""
    encoding = kwds.get("encoding", None)
    if encoding is not None:
        encoding = re.sub("_", "-", encoding).lower()
        kwds["encoding"] = encoding

    compression = kwds.get("compression", "infer")
    compression = infer_compression(filepath_or_buffer, compression)

    # TODO: get_filepath_or_buffer could return
    # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
    # though mypy handling of conditional imports is difficult.
    # See https://github.com/python/mypy/issues/1297
    fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
        filepath_or_buffer, encoding, compression
    )
    kwds["compression"] = compression

    if kwds.get("date_parser", None) is not None:
        if isinstance(kwds["parse_dates"], bool):
            kwds["parse_dates"] = True

    # Extract some of the arguments (pass chunksize on).
    iterator = kwds.get("iterator", False)
    chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1)
    nrows = kwds.get("nrows", None)

    # Check for duplicates in names.
    _validate_names(kwds.get("names", None))

    # Create the parser.
    parser = TextFileReader(fp_or_buf, **kwds)

    if chunksize or iterator:
        return parser

    try:
        data = parser.read(nrows)
    finally:
        parser.close()

    if should_close:
        try:
            fp_or_buf.close()
        except ValueError:
            pass

    return data
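
# Usage sketch (not part of the original source): with `chunksize` (or
# `iterator=True`), _read returns the TextFileReader itself rather than
# reading eagerly; "data.csv" and `process` below are placeholders.
#
#     >>> reader = read_csv("data.csv", chunksize=1000)  # doctest: +SKIP
#     >>> for chunk in reader:  # doctest: +SKIP
#     ...     process(chunk)  # each chunk is a DataFrame of up to 1000 rows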

_parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "prefix": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "squeeze": False,
    "compression": None,
    "mangle_dupe_cols": True,
    "infer_datetime_format": False,
    "skip_blank_lines": True,
}


_c_parser_defaults = {
    "delim_whitespace": False,
    "na_filter": True,
    "low_memory": True,
    "memory_map": False,
    "error_bad_lines": True,
    "warn_bad_lines": True,
    "float_precision": None,
}

_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}

_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}

_deprecated_defaults: Dict[str, Any] = {}
_deprecated_args: Set[str] = set()


def _make_parser_function(name, default_sep=","):
    def parser_f(
        filepath_or_buffer: FilePathOrBuffer,
        sep=default_sep,
        delimiter=None,
        # Column and Index Locations and Names
        header="infer",
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        prefix=None,
        mangle_dupe_cols=True,
        # General Parsing Configuration
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        skipfooter=0,
        nrows=None,
        # NA and Missing Data Handling
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=False,
        skip_blank_lines=True,
        # Datetime Handling
        parse_dates=False,
        infer_datetime_format=False,
        keep_date_col=False,
        date_parser=None,
        dayfirst=False,
        cache_dates=True,
        # Iteration
        iterator=False,
        chunksize=None,
        # Quoting, Compression, and File Format
        compression="infer",
        thousands=None,
        decimal: str = ".",
        lineterminator=None,
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
        doublequote=True,
        escapechar=None,
        comment=None,
        encoding=None,
        dialect=None,
        # Error Handling
        error_bad_lines=True,
        warn_bad_lines=True,
        # Internal
        delim_whitespace=False,
        low_memory=_c_parser_defaults["low_memory"],
        memory_map=False,
        float_precision=None,
    ):
        # gh-23761
        #
        # When a dialect is passed, it overrides any of the overlapping
        # parameters passed in directly. We don't want to warn if the
        # default parameters were passed in (since it probably means
        # that the user didn't pass them in explicitly in the first place).
        #
        # "delimiter" is the annoying corner case because we alias it to
        # "sep" before doing comparison to the dialect values later on.
        #
        # Thus, we need a flag to indicate that we need to "override"
        # the comparison to dialect values by checking if default values
        # for BOTH "delimiter" and "sep" were provided.
        if dialect is not None:
            sep_override = delimiter is None and sep == default_sep
            kwds = dict(sep_override=sep_override)
        else:
            kwds = dict()

        # Alias sep -> delimiter.
        if delimiter is None:
            delimiter = sep

        if delim_whitespace and delimiter != default_sep:
            raise ValueError(
                "Specified a delimiter with both sep and "
                "delim_whitespace=True; you can only specify one."
            )

        if engine is not None:
            engine_specified = True
        else:
            engine = "c"
            engine_specified = False

        kwds.update(
            delimiter=delimiter,
            engine=engine,
            dialect=dialect,
            compression=compression,
            engine_specified=engine_specified,
            doublequote=doublequote,
            escapechar=escapechar,
            quotechar=quotechar,
            quoting=quoting,
            skipinitialspace=skipinitialspace,
            lineterminator=lineterminator,
            header=header,
            index_col=index_col,
            names=names,
            prefix=prefix,
            skiprows=skiprows,
            skipfooter=skipfooter,
            na_values=na_values,
            true_values=true_values,
            false_values=false_values,
            keep_default_na=keep_default_na,
            thousands=thousands,
            comment=comment,
            decimal=decimal,
            parse_dates=parse_dates,
            keep_date_col=keep_date_col,
            dayfirst=dayfirst,
            date_parser=date_parser,
            cache_dates=cache_dates,
            nrows=nrows,
            iterator=iterator,
            chunksize=chunksize,
            converters=converters,
            dtype=dtype,
            usecols=usecols,
            verbose=verbose,
            encoding=encoding,
            squeeze=squeeze,
            memory_map=memory_map,
            float_precision=float_precision,
            na_filter=na_filter,
            delim_whitespace=delim_whitespace,
            warn_bad_lines=warn_bad_lines,
            error_bad_lines=error_bad_lines,
            low_memory=low_memory,
            mangle_dupe_cols=mangle_dupe_cols,
            infer_datetime_format=infer_datetime_format,
            skip_blank_lines=skip_blank_lines,
        )

        return _read(filepath_or_buffer, kwds)

    parser_f.__name__ = name

    return parser_f


read_csv = _make_parser_function("read_csv", default_sep=",")
read_csv = Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
    )
)(read_csv)
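
# Usage sketch (not part of the original source): _make_parser_function
# builds `read_csv`, and the Appender decorator attaches the shared
# docstring. A minimal, self-contained call with an in-memory buffer:
#
#     >>> from io import StringIO
#     >>> df = read_csv(StringIO("a,b\n1,2\n3,4"))
#     >>> df.shape
#     (2, 2)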

read_table = _make_parser_function("read_table", default_sep="\t")
read_table = Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
    )
)(read_table)


def read_fwf(
    filepath_or_buffer: FilePathOrBuffer,
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):
    r"""
    Read a table of fixed-width formatted lines into DataFrame.

    Also supports optionally iterating or breaking of the file
    into chunks.

    Additional help can be found in the `online docs for IO Tools
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.csv``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.
    colspecs : list of tuple (int, int) or 'infer', optional
        A list of tuples giving the extents of the fixed-width
        fields of each line as half-open intervals (i.e., [from, to) ).
        String value 'infer' can be used to instruct the parser to try
        detecting the column specifications from the first 100 rows of
        the data which are not being skipped via skiprows (default='infer').
    widths : list of int, optional
        A list of field widths which can be used instead of 'colspecs' if
        the intervals are contiguous.
    infer_nrows : int, default 100
        The number of rows to consider when letting the parser determine the
        `colspecs`.

        .. versionadded:: 0.24.0
    **kwds : optional
        Optional keyword arguments can be passed to ``TextFileReader``.

    Returns
    -------
    DataFrame or TextParser
        A comma-separated values (csv) file is returned as two-dimensional
        data structure with labeled axes.

    See Also
    --------
    to_csv : Write DataFrame to a comma-separated values (csv) file.
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Examples
    --------
    >>> pd.read_fwf('data.csv')  # doctest: +SKIP
    """
    # Check input arguments.
    if colspecs is None and widths is None:
        raise ValueError("Must specify either colspecs or widths")
    elif colspecs not in (None, "infer") and widths is not None:
        raise ValueError("You must specify only one of 'widths' and 'colspecs'")

    # Compute 'colspecs' from 'widths', if specified.
    if widths is not None:
        colspecs, col = [], 0
        for w in widths:
            colspecs.append((col, col + w))
            col += w

    kwds["colspecs"] = colspecs
    kwds["infer_nrows"] = infer_nrows
    kwds["engine"] = "python-fwf"
    return _read(filepath_or_buffer, kwds)
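
# Illustration (not part of the original source): contiguous `widths` are
# converted into half-open `colspecs` intervals by the loop above, e.g.
#
#     >>> widths = [3, 5, 2]
#     >>> colspecs, col = [], 0
#     >>> for w in widths:
#     ...     colspecs.append((col, col + w))
#     ...     col += w
#     >>> colspecs
#     [(0, 3), (3, 8), (8, 10)]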

class TextFileReader(abc.Iterator):
    """
    Passed dialect overrides any of the related parser options.
    """

    def __init__(self, f, engine=None, **kwds):

        self.f = f

        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        if kwds.get("dialect") is not None:
            dialect = kwds["dialect"]
            if dialect in csv.list_dialects():
                dialect = csv.get_dialect(dialect)

            # Any valid dialect should have these attributes.
            # If any are missing, we will raise automatically.
            for param in (
                "delimiter",
                "doublequote",
                "escapechar",
                "skipinitialspace",
                "quotechar",
                "quoting",
            ):
                try:
                    dialect_val = getattr(dialect, param)
                except AttributeError:
                    raise ValueError(f"Invalid dialect {kwds['dialect']} provided")
                parser_default = _parser_defaults[param]
                provided = kwds.get(param, parser_default)

                # Messages for conflicting values between the dialect
                # instance and the actual parameters provided.
                conflict_msgs = []

                # Don't warn if the default parameter was passed in,
                # even if it conflicts with the dialect (gh-23761).
                if provided != parser_default and provided != dialect_val:
                    msg = (
                        f"Conflicting values for '{param}': '{provided}' was "
                        f"provided, but the dialect specifies '{dialect_val}'. "
                        "Using the dialect-specified value."
                    )

                    # Annoying corner case for not warning about
                    # conflicts between dialect and delimiter parameter.
                    # Refer to "_make_parser_function", where "sep_override"
                    # is set, for more info.
                    if not (param == "delimiter" and kwds.pop("sep_override", False)):
                        conflict_msgs.append(msg)

                if conflict_msgs:
                    warnings.warn(
                        "\n\n".join(conflict_msgs), ParserWarning, stacklevel=2
                    )
                kwds[param] = dialect_val

        if kwds.get("skipfooter"):
            if kwds.get("iterator") or kwds.get("chunksize"):
                raise ValueError("'skipfooter' not supported for 'iteration'")
            if kwds.get("nrows"):
                raise ValueError("'skipfooter' not supported with 'nrows'")

        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self.engine = engine
        self._engine = None
        self._currow = 0

        options = self._get_options_with_defaults(engine)

        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)
        self.squeeze = options.pop("squeeze", False)

        # might mutate self.engine
        self.engine = self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self._make_engine(self.engine)

    def close(self):
        self._engine.close()

    def _get_options_with_defaults(self, engine):
        kwds = self.orig_options

        options = {}

        for argname, default in _parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if argname == "mangle_dupe_cols" and not value:
                raise ValueError("Setting mangle_dupe_cols=False is not supported yet")
            else:
                options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    elif value == _deprecated_defaults.get(argname, default):
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the"
                            f" {repr(engine)} engine"
                        )
            else:
                value = _deprecated_defaults.get(argname, default)
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine):
        # see gh-16530
        if is_file_like(f):
            next_attr = "__next__"

            # The C engine doesn't need the file-like to have the "__next__"
            # attribute. However, the Python engine explicitly calls
            # "next(...)" when iterating through such an object, meaning it
            # needs to have that attribute.
            if engine != "c" and not hasattr(f, next_attr):
                msg = "The 'python' engine cannot iterate through this file buffer."
                raise ValueError(msg)

        return engine

    def _clean_options(self, options, engine):
        result = options.copy()

        engine_specified = self._engine_specified
        fallback_reason = None

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        encoding = sys.getfilesystemencoding() or "utf-8"
        if sep is None and not delim_whitespace:
            if engine == "c":
                fallback_reason = (
                    "the 'c' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    "the 'c' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are "
                    "interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            encodeable = True
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeDecodeError:
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    "is > 1 char long, and the 'c' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    "and the 'c' engine does not support such quotechars"
                )
                engine = "python"

        if fallback_reason and engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=5,
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        depr_warning = ""

        for arg in _deprecated_args:
            parser_default = _c_parser_defaults[arg]
            depr_default = _deprecated_defaults[arg]

            msg = (
                f"The {repr(arg)} argument has been deprecated and will be "
                "removed in a future version."
            )

            if result.get(arg, depr_default) != depr_default:
                depr_warning += msg + "\n\n"
            else:
                result[arg] = parser_default

        if depr_warning != "":
            warnings.warn(depr_warning, FutureWarning, stacklevel=2)

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if _is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python parsers
        if engine != "c":
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self):
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(self, engine="c"):
        if engine == "c":
            self._engine = CParserWrapper(self.f, **self.options)
        else:
            if engine == "python":
                klass = PythonParser
            elif engine == "python-fwf":
                klass = FixedWidthFieldParser
            else:
                raise ValueError(
                    f"Unknown engine: {engine} (valid options are "
                    '"c", "python", or "python-fwf")'
                )
            self._engine = klass(self.f, **self.options)

    def _failover_to_python(self):
        raise AbstractMethodError(self)

    def read(self, nrows=None):
        nrows = _validate_integer("nrows", nrows)
        ret = self._engine.read(nrows)

        # May alter columns / col_dict
        index, columns, col_dict = self._create_index(ret)

        if index is None:
            if col_dict:
                # Any column is actually fine:
                new_rows = len(next(iter(col_dict.values())))
                index = RangeIndex(self._currow, self._currow + new_rows)
            else:
                new_rows = 0
        else:
            new_rows = len(index)

        df = DataFrame(col_dict, columns=columns, index=index)

        self._currow += new_rows

        if self.squeeze and len(df.columns) == 1:
            return df[df.columns[0]].copy()
        return df

    def _create_index(self, ret):
        index, columns, col_dict = ret
        return index, columns, col_dict

    def get_chunk(self, size=None):
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            if self._currow >= self.nrows:
                raise StopIteration
            size = min(size, self.nrows - self._currow)
        return self.read(nrows=size)


def _is_index_col(col):
    return col is not None and col is not False
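
# Illustration (not part of the original source): any value other than None
# or False counts as an index-column specification, including column 0.
#
#     >>> _is_index_col(0), _is_index_col("a"), _is_index_col(False)
#     (True, True, False)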

def _is_potential_multi_index(columns):
    """
    Check whether or not the `columns` parameter
    could be converted into a MultiIndex.

    Parameters
    ----------
    columns : array-like
        Object which may or may not be convertible into a MultiIndex

    Returns
    -------
    boolean : Whether or not columns could become a MultiIndex
    """
    return (
        len(columns)
        and not isinstance(columns, MultiIndex)
        and all(isinstance(c, tuple) for c in columns)
    )
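
# Illustration (not part of the original source): tuple-valued column labels
# signal a potential MultiIndex; plain strings do not. Note that an empty
# sequence short-circuits to 0 (falsy) rather than False.
#
#     >>> _is_potential_multi_index([("a", "x"), ("a", "y")])
#     True
#     >>> _is_potential_multi_index(["a", "b"])
#     False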


def _evaluate_usecols(usecols, names):
    """
    Check whether or not the 'usecols' parameter
    is a callable. If so, enumerates the 'names'
    parameter and returns a set of indices for
    each entry in 'names' that evaluates to True.
    If not a callable, returns 'usecols'.
    """
    if callable(usecols):
        return {i for i, name in enumerate(names) if usecols(name)}
    return usecols
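
# Illustrative sketch (added): a callable spec is resolved against the column
# names to a set of positional indices; anything else passes through.
#
# >>> _evaluate_usecols(lambda name: name.startswith("a"), ["ab", "b", "ac"])
# {0, 2}
# >>> _evaluate_usecols([0, 2], ["ab", "b", "ac"])
# [0, 2]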


def _validate_usecols_names(usecols, names):
    """
    Validates that all usecols are present in a given
    list of names. If not, raise a ValueError that
    shows what usecols are missing.

    Parameters
    ----------
    usecols : iterable of usecols
        The columns to validate are present in names.
    names : iterable of names
        The column names to check against.

    Returns
    -------
    usecols : iterable of usecols
        The `usecols` parameter if the validation succeeds.

    Raises
    ------
    ValueError : Columns were missing. Error message will list them.
    """
    missing = [c for c in usecols if c not in names]
    if len(missing) > 0:
        raise ValueError(
            "Usecols do not match columns, "
            f"columns expected but not found: {missing}"
        )

    return usecols


def _validate_skipfooter_arg(skipfooter):
    """
    Validate the 'skipfooter' parameter.

    Checks whether 'skipfooter' is a non-negative integer.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    skipfooter : non-negative integer
        The number of rows to skip at the end of the file.

    Returns
    -------
    validated_skipfooter : non-negative integer
        The original input if the validation succeeds.

    Raises
    ------
    ValueError : 'skipfooter' was not a non-negative integer.
    """
    if not is_integer(skipfooter):
        raise ValueError("skipfooter must be an integer")

    if skipfooter < 0:
        raise ValueError("skipfooter cannot be negative")

    return skipfooter


def _validate_usecols_arg(usecols):
    """
    Validate the 'usecols' parameter.

    Checks whether or not the 'usecols' parameter contains all integers
    (column selection by index), strings (column by name) or is a callable.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    usecols : list-like, callable, or None
        List of columns to use when parsing or a callable that can be used
        to filter a list of table columns.

    Returns
    -------
    usecols_tuple : tuple
        A tuple of (verified_usecols, usecols_dtype).

        'verified_usecols' is either a set if an array-like is passed in or
        'usecols' if a callable or None is passed in.

        'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
        is passed in or None if a callable or None is passed in.
    """
    msg = (
        "'usecols' must either be list-like of all strings, all unicode, "
        "all integers or a callable."
    )
    if usecols is not None:
        if callable(usecols):
            return usecols, None

        if not is_list_like(usecols):
            # see gh-20529
            #
            # Ensure it is iterable container but not string.
            raise ValueError(msg)

        usecols_dtype = lib.infer_dtype(usecols, skipna=False)

        if usecols_dtype not in ("empty", "integer", "string", "unicode"):
            raise ValueError(msg)

        usecols = set(usecols)

        return usecols, usecols_dtype
    return usecols, None
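
# Illustrative sketch (added): list-likes are normalized to a set plus an
# inferred dtype; callables and None pass through with a dtype of None.
# (Set display order may vary.)
#
# >>> _validate_usecols_arg(["a", "b"])
# ({'a', 'b'}, 'string')
# >>> _validate_usecols_arg(None)
# (None, None)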


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the 'parse_dates' parameter
    is a non-boolean scalar. Raises a TypeError if
    that is the case.
    """
    msg = (
        "Only booleans, lists, and "
        "dictionaries are accepted "
        "for the 'parse_dates' parameter"
    )

    if parse_dates is not None:
        if is_scalar(parse_dates):
            if not lib.is_bool(parse_dates):
                raise TypeError(msg)

        elif not isinstance(parse_dates, (list, dict)):
            raise TypeError(msg)

    return parse_dates
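
# Illustrative sketch (added): booleans, lists and dicts are accepted;
# any other scalar is rejected.
#
# >>> _validate_parse_dates_arg(["date"])
# ['date']
# >>> _validate_parse_dates_arg("date")  # non-boolean scalar
# Traceback (most recent call last):
# TypeError: Only booleans, lists, and dictionaries are accepted ...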


class ParserBase:
    def __init__(self, kwds):
        self.names = kwds.get("names")
        self.orig_names = None
        self.prefix = kwds.pop("prefix", None)

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols = set()
        self.index_names = None
        self.col_names = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self.date_parser = kwds.pop("date_parser", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.mangle_dupe_cols = kwds.get("mangle_dupe_cols", True)
        self.infer_datetime_format = kwds.pop("infer_datetime_format", False)
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            dayfirst=self.dayfirst,
            infer_datetime_format=self.infer_datetime_format,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if isinstance(self.header, (list, tuple, np.ndarray)):
            if not all(map(is_integer, self.header)):
                raise ValueError("header must be integer or list of integers")
            if any(i < 0 for i in self.header):
                raise ValueError(
                    "cannot specify multi-index header with negative integers"
                )
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

            # validate index_col that only contains integers
            if self.index_col is not None:
                is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
                if not (
                    is_sequence
                    and all(map(is_integer, self.index_col))
                    or is_integer(self.index_col)
                ):
                    raise ValueError(
                        "index_col must only contain row numbers "
                        "when specifying a multi-index header"
                    )

        # GH 16338
        elif self.header is not None and not is_integer(self.header):
            raise ValueError("header must be integer or list of integers")

        # GH 27779
        elif self.header is not None and self.header < 0:
            raise ValueError(
                "Passing negative integer to header is invalid. "
                "For no header, use header=None instead"
            )

        self._name_processed = False

        self._first_chunk = True

        # GH 13932
        # keep references to file handles opened by the parser itself
        self.handles = []

    def close(self):
        for f in self.handles:
            f.close()

    @property
    def _has_complex_date_col(self):
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    def _should_parse_dates(self, i):
        if isinstance(self.parse_dates, bool):
            return self.parse_dates
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = self.index_col[i]

            if is_scalar(self.parse_dates):
                return (j == self.parse_dates) or (
                    name is not None and name == self.parse_dates
                )
            else:
                return (j in self.parse_dates) or (
                    name is not None and name in self.parse_dates
                )

    def _extract_multi_indexer_columns(
        self, header, index_names, col_names, passed_names=False
    ):
        """
        Extract and return the names, index_names, col_names.

        header is a list-of-lists returned from the parsers.
        """
        if len(header) < 2:
            return header[0], index_names, col_names, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, names, index_col = _clean_index_names(
            index_names, self.index_col, self.unnamed_cols
        )

        # extract the columns
        field_count = len(header[0])

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = ic + columns

        # If we find unnamed columns all in a single
        # level, then our header was too long.
        for n in range(len(columns[0])):
            if all(ensure_str(col[n]) in self.unnamed_cols for col in columns):
                raise ParserError(
                    "Passed header=[{header}] are too many rows for this "
                    "multi_index of columns".format(
                        header=",".join(str(x) for x in self.header)
                    )
                )

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names

    def _maybe_dedup_names(self, names):
        # see gh-7160 and gh-9424: this helps to provide
        # immediate alleviation of the duplicate names
        # issue and appears to be satisfactory to users,
        # but ultimately, not needing to butcher the names
        # would be nice!
        if self.mangle_dupe_cols:
            names = list(names)  # so we can index
            counts = defaultdict(int)
            is_potential_mi = _is_potential_multi_index(names)

            for i, col in enumerate(names):
                cur_count = counts[col]

                while cur_count > 0:
                    counts[col] = cur_count + 1

                    if is_potential_mi:
                        col = col[:-1] + (f"{col[-1]}.{cur_count}",)
                    else:
                        col = f"{col}.{cur_count}"
                    cur_count = counts[col]

                names[i] = col
                counts[col] = cur_count + 1

        return names
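
    # Illustrative sketch (added): with mangle_dupe_cols enabled (the
    # default), repeated labels receive positional ".<n>" suffixes.
    # Hypothetical call on some ParserBase instance `parser`:
    #
    # >>> parser._maybe_dedup_names(["a", "a", "b", "a"])
    # ['a', 'a.1', 'b', 'a.2']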

    def _maybe_make_multi_index_columns(self, columns, col_names=None):
        # possibly create a column mi here
        if _is_potential_multi_index(columns):
            columns = MultiIndex.from_tuples(columns, names=col_names)
        return columns

    def _make_index(self, data, alldata, columns, indexnamerow=False):
        if not _is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            index = self._get_simple_index(alldata, columns)
            index = self._agg_index(index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = _clean_index_names(
                    list(columns), self.index_col, self.unnamed_cols
                )
                self._name_processed = True
            index = self._get_complex_date_index(data, columns)
            index = self._agg_index(index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    _implicit_index = False

    def _get_simple_index(self, data, columns):
        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    def _get_complex_date_index(self, data, col_names):
        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    def _agg_index(self, index, try_parse_dates=True):
        arrays = []

        for i, arr in enumerate(index):

            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(arr)

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    def _convert_to_ndarrays(
        self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None
    ):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will "
                            "be used"
                        ),
                        ParserWarning,
                        stacklevel=7,
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    mask = algorithms.isin(values, list(na_values)).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool=False
                )
            else:
                is_str_or_ea_dtype = is_string_dtype(
                    cast_type
                ) or is_extension_array_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values, set(col_na_values) | col_na_fvalues, try_num_bool
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type and (
                    not is_dtype_equal(cvals, cast_type)
                    or is_extension_array_dtype(cast_type)
                ):
                    try:
                        if (
                            is_bool_dtype(cast_type)
                            and not is_categorical_dtype(cast_type)
                            and na_count > 0
                        ):
                            raise ValueError(f"Bool column has NA values in column {c}")
                    except (AttributeError, TypeError):
                        # invalid input to is_bool_dtype
                        pass
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result

    def _infer_types(self, values, na_values, try_num_bool=True):
        """
        Infer types of values, possibly casting.

        Parameters
        ----------
        values : ndarray
        na_values : set
        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            mask = algorithms.isin(values, list(na_values))
            na_count = mask.sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g DatetimeIndex here
            try:
                result = lib.maybe_convert_numeric(values, na_values, False)
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                result = values
                na_count = parsers.sanitize_objects(result, na_values, False)
            else:
                na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values, False)

        if result.dtype == np.object_ and try_num_bool:
            result = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
            )

        return result, na_count
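
    # Illustrative sketch (added): on object input, numeric conversion is
    # attempted first, counting and replacing the configured NA strings.
    # Hypothetical call on some ParserBase instance `parser`; exact array
    # repr may differ:
    #
    # >>> vals = np.array(["1", "2", "N/A"], dtype=object)
    # >>> parser._infer_types(vals, na_values={"N/A"})
    # (array([ 1.,  2., nan]), 1)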

    def _cast_types(self, values, cast_type, column):
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray
        cast_type : string or np.dtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray
        """
        if is_categorical_dtype(cast_type):
            known_cats = (
                isinstance(cast_type, CategoricalDtype)
                and cast_type.categories is not None
            )

            if not is_object_dtype(values) and not known_cats:
                # XXX this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = astype_nansafe(values, str)

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif is_extension_array_dtype(cast_type):
            # ensure cast_type is an actual dtype and not a string
            cast_type = pandas_dtype(cast_type)
            array_type = cast_type.construct_array_type()
            try:
                return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order "
                    "to be used in parser methods"
                )

        else:
            try:
                values = astype_nansafe(values, cast_type, copy=True, skipna=True)
            except ValueError:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                )
        return values

    def _do_date_conversions(self, names, data):
        # returns data, columns
        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
            )

        return names, data


class CParserWrapper(ParserBase):
    """
    Wrapper around the C-engine reader (parsers.TextReader).
    """

    def __init__(self, src, **kwds):
        self.kwds = kwds
        kwds = kwds.copy()

        ParserBase.__init__(self, kwds)

        encoding = kwds.get("encoding")

        if kwds.get("compression") is None and encoding:
            if isinstance(src, str):
                src = open(src, "rb")
                self.handles.append(src)

            # Handle the file object with universal line mode enabled.
            # We will handle the newline character ourselves later on.
            if hasattr(src, "read") and not hasattr(src, "encoding"):
                src = TextIOWrapper(src, encoding=encoding, newline="")

            kwds["encoding"] = "utf-8"

        # #2442
        kwds["allow_leading_cols"] = self.index_col is not False

        # GH20529, validate usecol arg before TextReader
        self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
        kwds["usecols"] = self.usecols

        self._reader = parsers.TextReader(src, **kwds)
        self.unnamed_cols = self._reader.unnamed_cols

        passed_names = self.names is None

        if self._reader.header is None:
            self.names = None
        else:
            if len(self._reader.header) > 1:
                # we have a multi index in the columns
                (
                    self.names,
                    self.index_names,
                    self.col_names,
                    passed_names,
                ) = self._extract_multi_indexer_columns(
                    self._reader.header, self.index_names, self.col_names, passed_names
                )
            else:
                self.names = list(self._reader.header[0])

        if self.names is None:
            if self.prefix:
                self.names = [
                    f"{self.prefix}{i}" for i in range(self._reader.table_width)
                ]
            else:
                self.names = list(range(self._reader.table_width))

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        self.orig_names = self.names[:]

        if self.usecols:
            usecols = _evaluate_usecols(self.usecols, self.orig_names)

            # GH 14671
            if self.usecols_dtype == "string" and not set(usecols).issubset(
                self.orig_names
            ):
                _validate_usecols_names(usecols, self.orig_names)

            if len(self.names) > len(usecols):
                self.names = [
                    n
                    for i, n in enumerate(self.names)
                    if (i in usecols or n in usecols)
                ]

            if len(self.names) < len(usecols):
                _validate_usecols_names(usecols, self.names)

        self._set_noconvert_columns()

        self.orig_names = self.names

        if not self._has_complex_date_col:
            if self._reader.leading_cols == 0 and _is_index_col(self.index_col):
                self._name_processed = True
                (index_names, self.names, self.index_col) = _clean_index_names(
                    self.names, self.index_col, self.unnamed_cols
                )

                if self.index_names is None:
                    self.index_names = index_names

            if self._reader.header is None and not passed_names:
                self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0

    def close(self):
        for f in self.handles:
            f.close()

        # close additional handles opened by C parser (for compression)
        try:
            self._reader.close()
        except ValueError:
            pass

    def _set_noconvert_columns(self):
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions.
        """
        names = self.orig_names
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = list(self.usecols)
            usecols.sort()
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = self.names[:]
        else:
            # Usecols is empty.
            usecols = None

        def _set(x):
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = names.index(x)

            self._reader.set_noconvert(x)

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    _set(k)
            elif self.index_col is not None:
                _set(self.index_col)

    def set_error_bad_lines(self, status):
        self._reader.set_error_bad_lines(int(status))

    def read(self, nrows=None):
        try:
            data = self._reader.read(nrows)
        except StopIteration:
            if self._first_chunk:
                self._first_chunk = False
                names = self._maybe_dedup_names(self.orig_names)
                index, columns, col_dict = _get_empty_meta(
                    names,
                    self.index_col,
                    self.index_names,
                    dtype=self.kwds.get("dtype"),
                )
                columns = self._maybe_make_multi_index_columns(columns, self.col_names)

                if self.usecols is not None:
                    columns = self._filter_usecols(columns)

                col_dict = dict(
                    filter(lambda item: item[0] in columns, col_dict.items())
                )

                return index, columns, col_dict

            else:
                raise

        # Done with first read, next time raise StopIteration
        self._first_chunk = False

        names = self.names

        if self._reader.leading_cols:
            if self._has_complex_date_col:
                raise NotImplementedError("file structure not yet supported")

            # implicit index, no index names
            arrays = []

            for i in range(self._reader.leading_cols):
                if self.index_col is None:
                    values = data.pop(i)
                else:
                    values = data.pop(self.index_col[i])

                values = self._maybe_parse_dates(values, i, try_parse_dates=True)
                arrays.append(values)

            index = ensure_index_from_sequences(arrays)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            names = self._maybe_dedup_names(names)

            # rename dict keys
            data = sorted(data.items())
            data = {k: v for k, (i, v) in zip(names, data)}

            names, data = self._do_date_conversions(names, data)

        else:
            # rename dict keys
            data = sorted(data.items())

            # ugh, mutation
            names = list(self.orig_names)
            names = self._maybe_dedup_names(names)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            # columns as list
            alldata = [x[1] for x in data]

            data = {k: v for k, (i, v) in zip(names, data)}

            names, data = self._do_date_conversions(names, data)
            index, names = self._make_index(data, alldata, names)

        # maybe create a mi on the columns
        names = self._maybe_make_multi_index_columns(names, self.col_names)

        return index, names, data

    def _filter_usecols(self, names):
        # hackish
        usecols = _evaluate_usecols(self.usecols, names)
        if usecols is not None and len(names) != len(usecols):
            names = [
                name for i, name in enumerate(names) if i in usecols or name in usecols
            ]
        return names

    def _get_index_names(self):
        names = list(self._reader.header[0])
        idx_names = None

        if self._reader.leading_cols == 0 and self.index_col is not None:
            (idx_names, names, self.index_col) = _clean_index_names(
                names, self.index_col, self.unnamed_cols
            )

        return names, idx_names

    def _maybe_parse_dates(self, values, index, try_parse_dates=True):
        if try_parse_dates and self._should_parse_dates(index):
            values = self._date_conv(values)
        return values


def TextParser(*args, **kwds):
    """
    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, optional
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names: bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at bottom of file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take
        one input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    squeeze : bool, default False
        returns Series if only one column.
    infer_datetime_format: bool, default False
        If True and `parse_dates` is True for a column, try to infer the
        datetime format based on the first datetime string. If the format
        can be inferred, there often will be a large parsing speed-up.
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are None for the ordinary converter,
        'high' for the high-precision converter, and 'round_trip' for the
        round-trip converter.
    """
    kwds["engine"] = "python"
    return TextFileReader(*args, **kwds)
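
# Illustrative sketch (added): TextParser simply routes to the python engine,
# which also accepts pre-split rows (lists of lists) rather than a file.
# Hypothetical data shown inline; dtypes follow the usual inference:
#
# >>> rows = [["a", "b"], ["1", "2"], ["3", "4"]]
# >>> TextParser(rows, header=0).read()
#    a  b
# 0  1  2
# 1  3  4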


def count_empty_vals(vals):
    return sum(1 for v in vals if v == "" or v is None)


class PythonParser(ParserBase):
    def __init__(self, f, **kwds):
        """
        Workhorse function for processing nested list into DataFrame
        """
        ParserBase.__init__(self, kwds)

        self.data = None
        self.buf = []
        self.pos = 0
        self.line_pos = 0

        self.encoding = kwds["encoding"]
        self.compression = kwds["compression"]
        self.memory_map = kwds["memory_map"]
        self.skiprows = kwds["skiprows"]

        if callable(self.skiprows):
            self.skipfunc = self.skiprows
        else:
            self.skipfunc = lambda x: x in self.skiprows

        self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
        self.delimiter = kwds["delimiter"]

        self.quotechar = kwds["quotechar"]
        if isinstance(self.quotechar, str):
            self.quotechar = str(self.quotechar)

        self.escapechar = kwds["escapechar"]
        self.doublequote = kwds["doublequote"]
        self.skipinitialspace = kwds["skipinitialspace"]
        self.lineterminator = kwds["lineterminator"]
        self.quoting = kwds["quoting"]
        self.usecols, _ = _validate_usecols_arg(kwds["usecols"])
        self.skip_blank_lines = kwds["skip_blank_lines"]

        self.warn_bad_lines = kwds["warn_bad_lines"]
        self.error_bad_lines = kwds["error_bad_lines"]

        self.names_passed = kwds["names"] or None

        self.has_index_names = False
        if "has_index_names" in kwds:
            self.has_index_names = kwds["has_index_names"]

        self.verbose = kwds["verbose"]
        self.converters = kwds["converters"]
        self.dtype = kwds["dtype"]
        self.thousands = kwds["thousands"]
        self.decimal = kwds["decimal"]

        self.comment = kwds["comment"]
        self._comment_lines = []

        f, handles = get_handle(
            f,
            "r",
            encoding=self.encoding,
            compression=self.compression,
            memory_map=self.memory_map,
        )
        self.handles.extend(handles)

        # Set self.data to something that can read lines.
        if hasattr(f, "readline"):
            self._make_reader(f)
        else:
            self.data = f

        # Get columns in two steps: infer from data, then
        # infer column indices from self.usecols if it is specified.
        self._col_indices = None
        (
            self.columns,
            self.num_original_columns,
            self.unnamed_cols,
        ) = self._infer_columns()

        # Now self.columns has the set of columns that we will process.
        # The original set is stored in self.original_columns.
        if len(self.columns) > 1:
            # we are processing a multi index column
            (
                self.columns,
                self.index_names,
                self.col_names,
                _,
            ) = self._extract_multi_indexer_columns(
                self.columns, self.index_names, self.col_names
            )
            # Update list of original names to include all indices.
            self.num_original_columns = len(self.columns)
        else:
            self.columns = self.columns[0]

        # get popped off for index
        self.orig_names = list(self.columns)

        # needs to be cleaned/refactored
        # multiple date column thing turning into a real spaghetti factory

        if not self._has_complex_date_col:
            (index_names, self.orig_names, self.columns) = self._get_index_name(
                self.columns
            )
            self._name_processed = True
            if self.index_names is None:
                self.index_names = index_names

        if self.parse_dates:
            self._no_thousands_columns = self._set_no_thousands_columns()
        else:
            self._no_thousands_columns = None

        if len(self.decimal) != 1:
            raise ValueError("Only length-1 decimal markers supported")

        if self.thousands is None:
            self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+")
        else:
            self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+")

    def _set_no_thousands_columns(self):
        # Create a set of column ids that are not to be stripped of thousands
        # operators.
        noconvert_columns = set()

        def _set(x):
            if is_integer(x):
                noconvert_columns.add(x)
            else:
                noconvert_columns.add(self.columns.index(x))

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        _set(k)
                else:
                    _set(val)

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    _set(k)
            elif self.index_col is not None:
                _set(self.index_col)

        return noconvert_columns

    def _make_reader(self, f):
        sep = self.delimiter

        if sep is None or len(sep) == 1:
            if self.lineterminator:
                raise ValueError(
                    "Custom line terminators not supported in python parser (yet)"
                )

            class MyDialect(csv.Dialect):
                delimiter = self.delimiter
                quotechar = self.quotechar
                escapechar = self.escapechar
                doublequote = self.doublequote
                skipinitialspace = self.skipinitialspace
                quoting = self.quoting
                lineterminator = "\n"

            dia = MyDialect

            sniff_sep = True

            if sep is not None:
                sniff_sep = False
                dia.delimiter = sep
            # attempt to sniff the delimiter
            if sniff_sep:
                line = f.readline()
                while self.skipfunc(self.pos):
                    self.pos += 1
                    line = f.readline()

                line = self._check_comments([line])[0]

                self.pos += 1
                self.line_pos += 1
                sniffed = csv.Sniffer().sniff(line)
                dia.delimiter = sniffed.delimiter

                # Note: self.encoding is irrelevant here
                line_rdr = csv.reader(StringIO(line), dialect=dia)
                self.buf.extend(list(line_rdr))

            # Note: self.encoding is irrelevant here
            reader = csv.reader(f, dialect=dia, strict=True)

        else:

            def _read():
                line = f.readline()
                pat = re.compile(sep)

                yield pat.split(line.strip())

                for line in f:
                    yield pat.split(line.strip())

            reader = _read()

        self.data = reader

    def read(self, rows=None):
        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        columns = list(self.orig_names)
        if not len(content):  # pragma: no cover
            # DataFrame with the right metadata, even though it's length 0
            names = self._maybe_dedup_names(self.orig_names)
            index, columns, col_dict = _get_empty_meta(
                names, self.index_col, self.index_names, self.dtype
            )
            columns = self._maybe_make_multi_index_columns(columns, self.col_names)
            return index, columns, col_dict

        # handle new style for names in index
        count_empty_content_vals = count_empty_vals(content[0])
        indexnamerow = None
        if self.has_index_names and count_empty_content_vals == len(columns):
            indexnamerow = content[0]
            content = content[1:]

        alldata = self._rows_to_cols(content)
        data = self._exclude_implicit_index(alldata)

        columns = self._maybe_dedup_names(self.columns)
        columns, data = self._do_date_conversions(columns, data)

        data = self._convert_data(data)
        index, columns = self._make_index(data, alldata, columns, indexnamerow)

        return index, columns, data

    def _exclude_implicit_index(self, alldata):
        names = self._maybe_dedup_names(self.orig_names)

        if self._implicit_index:
            excl_indices = self.index_col

            data = {}
            offset = 0
            for i, col in enumerate(names):
                while i + offset in excl_indices:
                    offset += 1
                data[col] = alldata[i + offset]
        else:
            data = {k: v for k, v in zip(names, alldata)}

        return data

    # legacy
    def get_chunk(self, size=None):
        if size is None:
            size = self.chunksize
        return self.read(rows=size)

    def _convert_data(self, data):
        # apply converters
        def _clean_mapping(mapping):
            """converts col numbers to names"""
            clean = {}
            for col, v in mapping.items():
                if isinstance(col, int) and col not in self.orig_names:
                    col = self.orig_names[col]
                clean[col] = v
            return clean

        clean_conv = _clean_mapping(self.converters)
        if not isinstance(self.dtype, dict):
            # handles single dtype applied to all columns
            clean_dtypes = self.dtype
        else:
            clean_dtypes = _clean_mapping(self.dtype)

        # Apply NA values.
        clean_na_values = {}
        clean_na_fvalues = {}

        if isinstance(self.na_values, dict):
            for col in self.na_values:
                na_value = self.na_values[col]
                na_fvalue = self.na_fvalues[col]

                if isinstance(col, int) and col not in self.orig_names:
                    col = self.orig_names[col]

                clean_na_values[col] = na_value
                clean_na_fvalues[col] = na_fvalue
        else:
            clean_na_values = self.na_values
            clean_na_fvalues = self.na_fvalues

        return self._convert_to_ndarrays(
            data,
            clean_na_values,
            clean_na_fvalues,
            self.verbose,
            clean_conv,
            clean_dtypes,
        )

    def _infer_columns(self):
        names = self.names
        num_original_columns = 0
        clear_buffer = True
        unnamed_cols = set()

        if self.header is not None:
            header = self.header

            if isinstance(header, (list, tuple, np.ndarray)):
                have_mi_columns = len(header) > 1
                # we have a mi columns, so read an extra line
                if have_mi_columns:
                    header = list(header) + [header[-1] + 1]
            else:
                have_mi_columns = False
                header = [header]

            columns = []
            for level, hr in enumerate(header):
                try:
                    line = self._buffered_line()

                    while self.line_pos <= hr:
                        line = self._next_line()

                except StopIteration:
                    if self.line_pos < hr:
                        raise ValueError(
                            f"Passed header={hr} but only {self.line_pos + 1} lines in "
                            "file"
                        )

                    # We have an empty file, so check
                    # if columns are provided. That will
                    # serve as the 'line' for parsing
                    if have_mi_columns and hr > 0:
                        if clear_buffer:
                            self._clear_buffer()
                        columns.append([None] * len(columns[-1]))
                        return columns, num_original_columns, unnamed_cols

                    if not self.names:
                        raise EmptyDataError("No columns to parse from file")

                    line = self.names[:]

                this_columns = []
                this_unnamed_cols = []

                for i, c in enumerate(line):
                    if c == "":
                        if have_mi_columns:
                            col_name = f"Unnamed: {i}_level_{level}"
                        else:
                            col_name = f"Unnamed: {i}"

                        this_unnamed_cols.append(i)
                        this_columns.append(col_name)
                    else:
                        this_columns.append(c)

                if not have_mi_columns and self.mangle_dupe_cols:
                    counts = defaultdict(int)

                    for i, col in enumerate(this_columns):
                        cur_count = counts[col]

                        while cur_count > 0:
                            counts[col] = cur_count + 1
                            col = f"{col}.{cur_count}"
                            cur_count = counts[col]

                        this_columns[i] = col
                        counts[col] = cur_count + 1
                elif have_mi_columns:

                    # if we have grabbed an extra line, but it's not in our
                    # format, save it in the buffer and create a blank extra
                    # line for the rest of the parsing code
                    if hr == header[-1]:
                        lc = len(this_columns)
                        ic = len(self.index_col) if self.index_col is not None else 0
                        unnamed_count = len(this_unnamed_cols)

                        if lc != unnamed_count and lc - ic > unnamed_count:
                            clear_buffer = False
                            this_columns = [None] * lc
                            self.buf = [self.buf[-1]]

                columns.append(this_columns)
                unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})

                if len(columns) == 1:
                    num_original_columns = len(this_columns)

            if clear_buffer:
                self._clear_buffer()

            if names is not None:
                if (self.usecols is not None and len(names) != len(self.usecols)) or (
                    self.usecols is None and len(names) != len(columns[0])
                ):
                    raise ValueError(
                        "Number of passed names did not match "
                        "number of header fields in the file"
                    )
                if len(columns) > 1:
                    raise TypeError("Cannot pass names with multi-index columns")

                if self.usecols is not None:
                    # Set _use_cols. We don't store columns because they are
                    # overwritten.
                    self._handle_usecols(columns, names)
                else:
                    self._col_indices = None
                    num_original_columns = len(names)
                columns = [names]
            else:
                columns = self._handle_usecols(columns, columns[0])
        else:
            try:
                line = self._buffered_line()

            except StopIteration:
                if not names:
                    raise EmptyDataError("No columns to parse from file")

                line = names[:]

            ncols = len(line)
            num_original_columns = ncols

            if not names:
                if self.prefix:
                    columns = [[f"{self.prefix}{i}" for i in range(ncols)]]
                else:
                    columns = [list(range(ncols))]
                columns = self._handle_usecols(columns, columns[0])
            else:
                if self.usecols is None or len(names) >= num_original_columns:
                    columns = self._handle_usecols([names], names)
                    num_original_columns = len(names)
                else:
                    if not callable(self.usecols) and len(names) != len(self.usecols):
                        raise ValueError(
                            "Number of passed names did not match number of "
                            "header fields in the file"
                        )
                    # Ignore output but set used columns.
                    self._handle_usecols([names], names)
                    columns = [names]
                    num_original_columns = ncols

        return columns, num_original_columns, unnamed_cols

    def _handle_usecols(self, columns, usecols_key):
        """
        Sets self._col_indices

        usecols_key is used if there are string usecols.
        """
        if self.usecols is not None:
            if callable(self.usecols):
                col_indices = _evaluate_usecols(self.usecols, usecols_key)
            elif any(isinstance(u, str) for u in self.usecols):
                if len(columns) > 1:
                    raise ValueError(
                        "If using multiple headers, usecols must be integers."
                    )
                col_indices = []

                for col in self.usecols:
                    if isinstance(col, str):
                        try:
                            col_indices.append(usecols_key.index(col))
                        except ValueError:
                            _validate_usecols_names(self.usecols, usecols_key)
                    else:
                        col_indices.append(col)
            else:
                col_indices = self.usecols

            columns = [
                [n for i, n in enumerate(column) if i in col_indices]
                for column in columns
            ]
            self._col_indices = col_indices
        return columns

    def _buffered_line(self):
        """
        Return a line from buffer, filling buffer if required.
        """
        if len(self.buf) > 0:
            return self.buf[0]
        else:
            return self._next_line()

    def _check_for_bom(self, first_row):
        """
        Checks whether the file begins with the BOM character.
        If it does, remove it. In addition, if there is quoting
        in the field subsequent to the BOM, remove it as well
        because it technically takes place at the beginning of
        the name, not the middle of it.
        """
        # first_row will be a list, so we need to check
        # that that list is not empty before proceeding.
        if not first_row:
            return first_row

        # The first element of this row is the one that could have the
        # BOM that we want to remove. Check that the first element is a
        # string before proceeding.
        if not isinstance(first_row[0], str):
            return first_row

        # Check that the string is not empty, as that would
        # obviously not have a BOM at the start of it.
        if not first_row[0]:
            return first_row

        # Since the string is non-empty, check that it does
        # in fact begin with a BOM.
        first_elt = first_row[0][0]
        if first_elt != _BOM:
            return first_row

        first_row_bom = first_row[0]

        if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
            start = 2
            quote = first_row_bom[1]
            end = first_row_bom[2:].index(quote) + 2

            # Extract the data between the quotation marks
            new_row = first_row_bom[start:end]

            # Extract any remaining data after the second
            # quotation mark.
            if len(first_row_bom) > end + 1:
                new_row += first_row_bom[end + 1 :]
            return [new_row] + first_row[1:]

        elif len(first_row_bom) > 1:
            return [first_row_bom[1:]]
        else:
            # First row is just the BOM, so we
            # return an empty string.
            return [""]
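
    # Illustrative sketch (added): with the default quotechar, a BOM glued to
    # the first field is stripped (example uses a single-field row).
    # Hypothetical call on some PythonParser instance `parser`:
    #
    # >>> parser._check_for_bom(["\ufeffcol1"])
    # ['col1']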

    def _is_line_empty(self, line):
        """
        Check if a line is empty or not.

        Parameters
        ----------
        line : str, array-like
            The line of data to check.

        Returns
        -------
        boolean : Whether or not the line is empty.
        """
        return not line or all(not x for x in line)

    def _next_line(self):
        if isinstance(self.data, list):
            while self.skipfunc(self.pos):
                self.pos += 1

            while True:
                try:
                    line = self._check_comments([self.data[self.pos]])[0]
                    self.pos += 1
                    # either uncommented or blank to begin with
                    if not self.skip_blank_lines and (
                        self._is_line_empty(self.data[self.pos - 1]) or line
                    ):
                        break
                    elif self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])
                        if ret:
                            line = ret[0]
                            break
                except IndexError:
                    raise StopIteration
        else:
            while self.skipfunc(self.pos):
                self.pos += 1
                next(self.data)

            while True:
                orig_line = self._next_iter_line(row_num=self.pos + 1)
                self.pos += 1

                if orig_line is not None:
                    line = self._check_comments([orig_line])[0]

                    if self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])

                        if ret:
                            line = ret[0]
                            break
                    elif self._is_line_empty(orig_line) or line:
                        break

        # This was the first line of the file,
        # which could contain the BOM at the
        # beginning of it.
        if self.pos == 1:
            line = self._check_for_bom(line)

        self.line_pos += 1
        self.buf.append(line)
        return line

    def _alert_malformed(self, msg, row_num):
        """
        Alert a user about a malformed row.

        If `self.error_bad_lines` is True, the alert will be `ParserError`.
        If `self.warn_bad_lines` is True, the alert will be printed out.

        Parameters
        ----------
        msg : The error message to display.
        row_num : The row number where the parsing error occurred.
            Because this row number is displayed, we 1-index,
            even though we 0-index internally.
        """
        if self.error_bad_lines:
            raise ParserError(msg)
        elif self.warn_bad_lines:
            base = f"Skipping line {row_num}: "
            sys.stderr.write(base + msg + "\n")

    def _next_iter_line(self, row_num):
        """
        Wrapper around iterating through `self.data` (CSV source).

        When a CSV error is raised, we check for specific
        error messages that allow us to customize the
        error message displayed to the user.

        Parameters
        ----------
        row_num : The row number of the line being parsed.
        """
        try:
            return next(self.data)
        except csv.Error as e:
            if self.warn_bad_lines or self.error_bad_lines:
                msg = str(e)

                if "NULL byte" in msg or "line contains NUL" in msg:
                    msg = (
                        "NULL byte detected. This byte "
                        "cannot be processed in Python's "
                        "native csv library at the moment, "
                        "so please pass in engine='c' instead"
                    )

                if self.skipfooter > 0:
                    reason = (
                        "Error could possibly be due to "
                        "parsing errors in the skipped footer rows "
                        "(the skipfooter keyword is only applied "
                        "after Python's csv library has parsed "
                        "all rows)."
                    )
                    msg += ". " + reason

                self._alert_malformed(msg, row_num)
            return None

    def _check_comments(self, lines):
        if self.comment is None:
            return lines
        ret = []
        for l in lines:
            rl = []
            for x in l:
                if not isinstance(x, str) or self.comment not in x:
                    rl.append(x)
                else:
                    x = x[: x.find(self.comment)]
                    if len(x) > 0:
                        rl.append(x)
                    break
            ret.append(rl)
        return ret

    def _remove_empty_lines(self, lines):
        """
        Iterate through the lines and remove any that are
        either empty or contain only one whitespace value

        Parameters
        ----------
        lines : array-like
            The array of lines that we are to filter.

        Returns
        -------
        filtered_lines : array-like
            The same array of lines with the "empty" ones removed.
        """
        ret = []
        for l in lines:
            # Remove empty lines and lines with only one whitespace value
            if (
                len(l) > 1
                or len(l) == 1
                and (not isinstance(l[0], str) or l[0].strip())
            ):
                ret.append(l)
        return ret
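
    # Illustrative sketch (added): rows that are empty, or hold only a single
    # all-whitespace string, are dropped; everything else is kept.
    # Hypothetical call on some PythonParser instance `parser`:
    #
    # >>> parser._remove_empty_lines([["a", "b"], [], ["   "], ["c"]])
    # [['a', 'b'], ['c']]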
  2476. def _check_thousands(self, lines):
  2477. if self.thousands is None:
  2478. return lines
  2479. return self._search_replace_num_columns(
  2480. lines=lines, search=self.thousands, replace=""
  2481. )
  2482. def _search_replace_num_columns(self, lines, search, replace):
  2483. ret = []
  2484. for l in lines:
  2485. rl = []
  2486. for i, x in enumerate(l):
  2487. if (
  2488. not isinstance(x, str)
  2489. or search not in x
  2490. or (self._no_thousands_columns and i in self._no_thousands_columns)
  2491. or self.nonnum.search(x.strip())
  2492. ):
  2493. rl.append(x)
  2494. else:
  2495. rl.append(x.replace(search, replace))
  2496. ret.append(rl)
  2497. return ret
  2498. def _check_decimal(self, lines):
  2499. if self.decimal == _parser_defaults["decimal"]:
  2500. return lines
  2501. return self._search_replace_num_columns(
  2502. lines=lines, search=self.decimal, replace="."
  2503. )
  2504. def _clear_buffer(self):
  2505. self.buf = []
  2506. _implicit_index = False
  2507. def _get_index_name(self, columns):
  2508. """
  2509. Try several cases to get lines:
  2510. 0) There are headers on row 0 and row 1 and their
  2511. total summed lengths equals the length of the next line.
  2512. Treat row 0 as columns and row 1 as indices
  2513. 1) Look for implicit index: there are more columns
  2514. on row 1 than row 0. If this is true, assume that row
  2515. 1 lists index columns and row 0 lists normal columns.
  2516. 2) Get index from the columns if it was listed.
  2517. """
  2518. orig_names = list(columns)
  2519. columns = list(columns)
  2520. try:
  2521. line = self._next_line()
  2522. except StopIteration:
  2523. line = None
  2524. try:
  2525. next_line = self._next_line()
  2526. except StopIteration:
  2527. next_line = None
  2528. # implicitly index_col=0 b/c 1 fewer column names
  2529. implicit_first_cols = 0
  2530. if line is not None:
  2531. # leave it 0, #2442
  2532. # Case 1
  2533. if self.index_col is not False:
  2534. implicit_first_cols = len(line) - self.num_original_columns
  2535. # Case 0
  2536. if next_line is not None:
  2537. if len(next_line) == len(line) + self.num_original_columns:
  2538. # column and index names on diff rows
  2539. self.index_col = list(range(len(line)))
  2540. self.buf = self.buf[1:]
  2541. for c in reversed(line):
  2542. columns.insert(0, c)
  2543. # Update list of original names to include all indices.
  2544. orig_names = list(columns)
  2545. self.num_original_columns = len(columns)
  2546. return line, orig_names, columns
  2547. if implicit_first_cols > 0:
  2548. # Case 1
  2549. self._implicit_index = True
  2550. if self.index_col is None:
  2551. self.index_col = list(range(implicit_first_cols))
  2552. index_name = None
  2553. else:
  2554. # Case 2
  2555. (index_name, columns_, self.index_col) = _clean_index_names(
  2556. columns, self.index_col, self.unnamed_cols
  2557. )
  2558. return index_name, orig_names, columns

    def _rows_to_cols(self, content):
        col_len = self.num_original_columns

        if self._implicit_index:
            col_len += len(self.index_col)

        max_len = max(len(row) for row in content)

        # Check that there are no rows with too many
        # elements in their row (rows with too few
        # elements are padded with NaN).
        if max_len > col_len and self.index_col is not False and self.usecols is None:
            footers = self.skipfooter if self.skipfooter else 0
            bad_lines = []

            iter_content = enumerate(content)
            content_len = len(content)
            content = []

            for (i, l) in iter_content:
                actual_len = len(l)

                if actual_len > col_len:
                    if self.error_bad_lines or self.warn_bad_lines:
                        row_num = self.pos - (content_len - i + footers)
                        bad_lines.append((row_num, actual_len))

                        if self.error_bad_lines:
                            break
                else:
                    content.append(l)

            for row_num, actual_len in bad_lines:
                msg = (
                    f"Expected {col_len} fields in line {row_num + 1}, saw "
                    f"{actual_len}"
                )
                if (
                    self.delimiter
                    and len(self.delimiter) > 1
                    and self.quoting != csv.QUOTE_NONE
                ):
                    # see gh-13374
                    reason = (
                        "Error could possibly be due to quotes being "
                        "ignored when a multi-char delimiter is used."
                    )
                    msg += ". " + reason

                self._alert_malformed(msg, row_num + 1)

        # see gh-13320
        zipped_content = list(lib.to_object_array(content, min_width=col_len).T)

        if self.usecols:
            if self._implicit_index:
                zipped_content = [
                    a
                    for i, a in enumerate(zipped_content)
                    if (
                        i < len(self.index_col)
                        or i - len(self.index_col) in self._col_indices
                    )
                ]
            else:
                zipped_content = [
                    a for i, a in enumerate(zipped_content) if i in self._col_indices
                ]
        return zipped_content
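
    # Illustrative sketch (lib.to_object_array is a pandas internal): rows
    # shorter than min_width are padded with NaN, and the transpose yields
    # one object array per column.
    #
    #   >>> lib.to_object_array([["a", 1], ["b"]], min_width=2).T
    #   array([['a', 'b'],
    #          [1, nan]], dtype=object)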

    def _get_lines(self, rows=None):
        lines = self.buf
        new_rows = None

        # already fetched some number
        if rows is not None:
            # we already have the lines in the buffer
            if len(self.buf) >= rows:
                new_rows, self.buf = self.buf[:rows], self.buf[rows:]

            # need some lines
            else:
                rows -= len(self.buf)

        if new_rows is None:
            if isinstance(self.data, list):
                if self.pos > len(self.data):
                    raise StopIteration
                if rows is None:
                    new_rows = self.data[self.pos :]
                    new_pos = len(self.data)
                else:
                    new_rows = self.data[self.pos : self.pos + rows]
                    new_pos = self.pos + rows

                # Check for stop rows. n.b.: self.skiprows is a set.
                if self.skiprows:
                    new_rows = [
                        row
                        for i, row in enumerate(new_rows)
                        if not self.skipfunc(i + self.pos)
                    ]

                lines.extend(new_rows)
                self.pos = new_pos

            else:
                new_rows = []
                try:
                    if rows is not None:
                        for _ in range(rows):
                            new_rows.append(next(self.data))
                        lines.extend(new_rows)
                    else:
                        rows = 0

                        while True:
                            new_row = self._next_iter_line(row_num=self.pos + rows + 1)
                            rows += 1

                            if new_row is not None:
                                new_rows.append(new_row)

                except StopIteration:
                    if self.skiprows:
                        new_rows = [
                            row
                            for i, row in enumerate(new_rows)
                            if not self.skipfunc(i + self.pos)
                        ]
                    lines.extend(new_rows)
                    if len(lines) == 0:
                        raise
                self.pos += len(new_rows)

            self.buf = []
        else:
            lines = new_rows

        if self.skipfooter:
            lines = lines[: -self.skipfooter]

        lines = self._check_comments(lines)
        if self.skip_blank_lines:
            lines = self._remove_empty_lines(lines)
        lines = self._check_thousands(lines)
        return self._check_decimal(lines)
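
    # Illustrative sketch (hypothetical state, assuming skipfooter, comments,
    # thousands and decimal are all unset): asking for fewer rows than are
    # buffered just splits the buffer, without touching the data source.
    #
    #   >>> self.buf = [["a"], ["b"], ["c"]]
    #   >>> self._get_lines(rows=2)
    #   [['a'], ['b']]
    #   >>> self.buf
    #   [['c']]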


def _make_date_converter(
    date_parser=None, dayfirst=False, infer_datetime_format=False, cache_dates=True
):
    def converter(*date_cols):
        if date_parser is None:
            strs = parsing._concat_date_cols(date_cols)

            try:
                return tools.to_datetime(
                    ensure_object(strs),
                    utc=None,
                    dayfirst=dayfirst,
                    errors="ignore",
                    infer_datetime_format=infer_datetime_format,
                    cache=cache_dates,
                ).to_numpy()

            except ValueError:
                return tools.to_datetime(
                    parsing.try_parse_dates(strs, dayfirst=dayfirst), cache=cache_dates
                )
        else:
            try:
                result = tools.to_datetime(
                    date_parser(*date_cols), errors="ignore", cache=cache_dates
                )
                if isinstance(result, datetime.datetime):
                    raise Exception("scalar parser")
                return result
            except Exception:
                try:
                    return tools.to_datetime(
                        parsing.try_parse_dates(
                            parsing._concat_date_cols(date_cols),
                            parser=date_parser,
                            dayfirst=dayfirst,
                        ),
                        errors="ignore",
                    )
                except Exception:
                    return generic_parser(date_parser, *date_cols)

    return converter
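
# Illustrative sketch (hypothetical call): without a date_parser, the returned
# converter concatenates the given columns and delegates to tools.to_datetime.
#
#   >>> conv = _make_date_converter(dayfirst=True)
#   >>> conv(np.array(["01/02/2000"], dtype=object))
#   array(['2000-02-01T00:00:00.000000000'], dtype='datetime64[ns]')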


def _process_date_conversion(
    data_dict,
    converter,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col=False,
):
    def _isindex(colspec):
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                data_dict[colspec] = converter(data_dict[colspec])
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter, colspec, data_dict, orig_names
            )

            new_data[new_name] = col
            new_cols.append(new_name)
            date_cols.update(old_names)

    data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols
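
# Illustrative sketch (hypothetical data; `conv` is a converter from
# _make_date_converter): a list-of-lists parse_spec combines the named columns
# into a new "y_m_d" column and, with keep_date_col=False, drops the originals.
# A dict spec works the same way but names the new column explicitly.
#
#   >>> data = {"y": ["2000"], "m": ["1"], "d": ["2"]}
#   >>> _process_date_conversion(data, conv, [["y", "m", "d"]], None, None, list(data))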


def _try_convert_dates(parser, colspec, data_dict, columns):
    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name = "_".join(str(x) for x in colnames)
    to_parse = [data_dict[c] for c in colnames if c in data_dict]

    new_col = parser(*to_parse)
    return new_name, new_col, colnames


def _clean_na_values(na_values, keep_default_na=True):

    if na_values is None:
        if keep_default_na:
            na_values = STR_NA_VALUES
        else:
            na_values = set()
        na_fvalues = set()
    elif isinstance(na_values, dict):
        old_na_values = na_values.copy()
        na_values = {}  # Prevent aliasing.

        # Convert the values in the na_values dictionary
        # into array-likes for further use. This is also
        # where we append the default NaN values, provided
        # that `keep_default_na=True`.
        for k, v in old_na_values.items():
            if not is_list_like(v):
                v = [v]

            if keep_default_na:
                v = set(v) | STR_NA_VALUES

            na_values[k] = v
        na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()}
    else:
        if not is_list_like(na_values):
            na_values = [na_values]
        na_values = _stringify_na_values(na_values)
        if keep_default_na:
            na_values = na_values | STR_NA_VALUES

        na_fvalues = _floatify_na_values(na_values)

    return na_values, na_fvalues
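
# Illustrative sketch: a scalar na_values entry is expanded into its string
# and numeric spellings (the int 999 and float 999.0 collapse in the set),
# with a parallel set of float forms for numeric comparison; set ordering in
# the display below is arbitrary.
#
#   >>> _clean_na_values(999, keep_default_na=False)
#   ({999, '999', '999.0'}, {999.0})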


def _clean_index_names(columns, index_col, unnamed_cols):
    if not _is_index_col(index_col):
        return None, columns, index_col

    columns = list(columns)

    cp_cols = list(columns)
    index_names = []

    # don't mutate
    index_col = list(index_col)

    for i, c in enumerate(index_col):
        if isinstance(c, str):
            index_names.append(c)
            for j, name in enumerate(cp_cols):
                if name == c:
                    index_col[i] = j
                    columns.remove(name)
                    break
        else:
            name = cp_cols[c]
            columns.remove(name)
            index_names.append(name)

    # Only clean index names that were placeholders.
    for i, name in enumerate(index_names):
        if isinstance(name, str) and name in unnamed_cols:
            index_names[i] = None

    return index_names, columns, index_col
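
# Illustrative sketch: a string index_col entry is resolved to its position,
# the matching column is moved out of the regular columns, and placeholder
# ("Unnamed: ...") index names are nulled out.
#
#   >>> _clean_index_names(["idx", "a", "b"], ["idx"], set())
#   (['idx'], ['a', 'b'], [0])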


def _get_empty_meta(columns, index_col, index_names, dtype=None):
    columns = list(columns)

    # Convert `dtype` to a defaultdict of some kind.
    # This will enable us to write `dtype[col_name]`
    # without worrying about KeyError issues later on.
    if not isinstance(dtype, dict):
        # if dtype is None, the default will be object.
        default_dtype = dtype or object
        dtype = defaultdict(lambda: default_dtype)
    else:
        # Save a copy of the dictionary.
        _dtype = dtype.copy()
        dtype = defaultdict(lambda: object)

        # Convert column indexes to column names.
        for k, v in _dtype.items():
            col = columns[k] if is_integer(k) else k
            dtype[col] = v

    # Even though we have no data, the "index" of the empty DataFrame
    # could for example still be an empty MultiIndex. Thus, we need to
    # check whether we have any index columns specified, via either:
    #
    # 1) index_col (column indices)
    # 2) index_names (column names)
    #
    # Both must be non-null to ensure a successful construction. Otherwise,
    # we have to create a generic empty Index.
    if (index_col is None or index_col is False) or index_names is None:
        index = Index([])
    else:
        data = [Series([], dtype=dtype[name]) for name in index_names]
        index = ensure_index_from_sequences(data, names=index_names)
        index_col.sort()

        for i, n in enumerate(index_col):
            columns.pop(n - i)

    col_dict = {col_name: Series([], dtype=dtype[col_name]) for col_name in columns}

    return index, columns, col_dict
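
# Illustrative sketch: even with no rows, dtype and index_col are honored so
# an empty-but-typed DataFrame can be assembled from the pieces.
#
#   >>> index, columns, col_dict = _get_empty_meta(
#   ...     ["a", "b"], index_col=[0], index_names=["a"], dtype={"b": "int64"}
#   ... )
#   >>> columns, col_dict["b"].dtype
#   (['b'], dtype('int64'))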


def _floatify_na_values(na_values):
    # create float versions of the na_values
    result = set()
    for v in na_values:
        try:
            v = float(v)
            if not np.isnan(v):
                result.add(v)
        except (TypeError, ValueError, OverflowError):
            pass
    return result


def _stringify_na_values(na_values):
    """Return both the stringified and numeric variants of these values."""
    result = []
    for x in na_values:
        result.append(str(x))
        result.append(x)
        try:
            v = float(x)

            # For an integer-valued float like 999.0, also add
            # the "999.0" and "999" spellings.
            if v == int(v):
                v = int(v)
                result.append(f"{v}.0")
                result.append(str(v))

            result.append(v)
        except (TypeError, ValueError, OverflowError):
            pass
        try:
            result.append(int(x))
        except (TypeError, ValueError, OverflowError):
            pass
    return set(result)


def _get_na_values(col, na_values, na_fvalues, keep_default_na):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues


def _get_col_names(colspec, columns):
    colset = set(columns)
    colnames = []
    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int):
            colnames.append(columns[c])
    return colnames


class FixedWidthReader(abc.Iterator):
    """
    A reader of fixed-width lines.
    """

    def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100):
        self.f = f
        self.buffer = None
        self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
        self.comment = comment
        if colspecs == "infer":
            self.colspecs = self.detect_colspecs(
                infer_nrows=infer_nrows, skiprows=skiprows
            )
        else:
            self.colspecs = colspecs

        if not isinstance(self.colspecs, (tuple, list)):
            raise TypeError(
                "column specifications must be a list or tuple, "
                f"input was a {type(colspecs).__name__}"
            )

        for colspec in self.colspecs:
            if not (
                isinstance(colspec, (tuple, list))
                and len(colspec) == 2
                and isinstance(colspec[0], (int, np.integer, type(None)))
                and isinstance(colspec[1], (int, np.integer, type(None)))
            ):
                raise TypeError(
                    "Each column specification must be a "
                    "2-element tuple or list of integers"
                )

    def get_rows(self, infer_nrows, skiprows=None):
        """
        Read rows from self.f, skipping as specified.

        We distinguish buffer_rows (the first <= infer_nrows
        lines) from the rows returned to detect_colspecs
        because it's simpler to leave the other locations
        with skiprows logic alone than to modify them to
        deal with the fact we skipped some rows here as
        well.

        Parameters
        ----------
        infer_nrows : int
            Number of rows to read from self.f, not counting
            rows that are skipped.
        skiprows : set, optional
            Indices of rows to skip.

        Returns
        -------
        detect_rows : list of str
            A list containing the rows to read.
        """
        if skiprows is None:
            skiprows = set()
        buffer_rows = []
        detect_rows = []
        for i, row in enumerate(self.f):
            if i not in skiprows:
                detect_rows.append(row)
            buffer_rows.append(row)
            if len(detect_rows) >= infer_nrows:
                break
        self.buffer = iter(buffer_rows)
        return detect_rows

    def detect_colspecs(self, infer_nrows=100, skiprows=None):
        # Regex escape the delimiters
        delimiters = "".join(r"\{}".format(x) for x in self.delimiter)
        pattern = re.compile("([^{}]+)".format(delimiters))
        rows = self.get_rows(infer_nrows, skiprows)
        if not rows:
            raise EmptyDataError("No rows from which to infer column width")
        max_len = max(map(len, rows))
        mask = np.zeros(max_len + 1, dtype=int)
        if self.comment is not None:
            rows = [row.partition(self.comment)[0] for row in rows]
        for row in rows:
            for m in pattern.finditer(row):
                mask[m.start() : m.end()] = 1
        shifted = np.roll(mask, 1)
        shifted[0] = 0
        edges = np.where((mask ^ shifted) == 1)[0]
        edge_pairs = list(zip(edges[::2], edges[1::2]))
        return edge_pairs
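
    # Illustrative sketch (hypothetical rows): non-delimiter runs are OR-ed
    # into a mask, and the mask's 0->1 / 1->0 transitions become the column
    # edges. For rows like "aa  bbb":
    #
    #   >>> reader.detect_colspecs(infer_nrows=1)
    #   [(0, 2), (4, 7)]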

    def __next__(self):
        if self.buffer is not None:
            try:
                line = next(self.buffer)
            except StopIteration:
                self.buffer = None
                line = next(self.f)
        else:
            line = next(self.f)
        # Note: 'colspecs' is a sequence of half-open intervals.
        return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]
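
    # Illustrative sketch: each half-open colspec slices the raw line, and
    # the delimiter characters are stripped from both ends of the field.
    #
    #   >>> line = "aa  bbb\n"
    #   >>> [line[start:end].strip("\n\r\t ") for start, end in [(0, 2), (4, 7)]]
    #   ['aa', 'bbb']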


class FixedWidthFieldParser(PythonParser):
    """
    Specialization that converts fixed-width fields into DataFrames.
    See PythonParser for details.
    """

    def __init__(self, f, **kwds):
        # Support iterators, convert to a list.
        self.colspecs = kwds.pop("colspecs")
        self.infer_nrows = kwds.pop("infer_nrows")
        PythonParser.__init__(self, f, **kwds)

    def _make_reader(self, f):
        self.data = FixedWidthReader(
            f,
            self.colspecs,
            self.delimiter,
            self.comment,
            self.skiprows,
            self.infer_nrows,
        )