pytables.py

  1. """
  2. High level interface to PyTables for reading and writing pandas data structures
  3. to disk
  4. """
  5. import copy
  6. from datetime import date, tzinfo
  7. import itertools
  8. import os
  9. import re
  10. from typing import (
  11. TYPE_CHECKING,
  12. Any,
  13. Dict,
  14. Hashable,
  15. List,
  16. Optional,
  17. Tuple,
  18. Type,
  19. Union,
  20. )
  21. import warnings
  22. import numpy as np
  23. from pandas._config import config, get_option
  24. from pandas._libs import lib, writers as libwriters
  25. from pandas._libs.tslibs import timezones
  26. from pandas._typing import ArrayLike, FrameOrSeries
  27. from pandas.compat._optional import import_optional_dependency
  28. from pandas.errors import PerformanceWarning
  29. from pandas.util._decorators import cache_readonly
  30. from pandas.core.dtypes.common import (
  31. ensure_object,
  32. is_categorical_dtype,
  33. is_complex_dtype,
  34. is_datetime64_dtype,
  35. is_datetime64tz_dtype,
  36. is_extension_array_dtype,
  37. is_list_like,
  38. is_string_dtype,
  39. is_timedelta64_dtype,
  40. )
  41. from pandas.core.dtypes.generic import ABCExtensionArray
  42. from pandas.core.dtypes.missing import array_equivalent
  43. from pandas import (
  44. DataFrame,
  45. DatetimeIndex,
  46. Index,
  47. Int64Index,
  48. MultiIndex,
  49. PeriodIndex,
  50. Series,
  51. TimedeltaIndex,
  52. concat,
  53. isna,
  54. )
  55. from pandas.core.arrays.categorical import Categorical
  56. import pandas.core.common as com
  57. from pandas.core.computation.pytables import PyTablesExpr, maybe_expression
  58. from pandas.core.indexes.api import ensure_index
  59. from pandas.io.common import stringify_path
  60. from pandas.io.formats.printing import adjoin, pprint_thing
  61. if TYPE_CHECKING:
  62. from tables import File, Node, Col # noqa:F401

# versioning attribute
_version = "0.15.2"

# encoding
_default_encoding = "UTF-8"


def _ensure_decoded(s):
    """ if we have bytes, decode them to unicode """
    if isinstance(s, np.bytes_):
        s = s.decode("UTF-8")
    return s


def _ensure_encoding(encoding):
    # set the encoding if we need
    if encoding is None:
        encoding = _default_encoding
    return encoding


def _ensure_str(name):
    """
    Ensure that an index / column name is a str (python 3); otherwise it
    may be np.string dtype. Non-string dtypes are passed through unchanged.

    https://github.com/pandas-dev/pandas/issues/13492
    """
    if isinstance(name, str):
        name = str(name)
    return name


Term = PyTablesExpr


def _ensure_term(where, scope_level: int):
    """
    Ensure that the where is a Term or a list of Terms.

    This makes sure that we are capturing the scope of variables that are
    passed. Create the terms here with a frame_level=2 (we are 2 levels down).
    """
    # only consider list/tuple here as an ndarray is automatically a coordinate
    # list
    level = scope_level + 1
    if isinstance(where, (list, tuple)):
        wlist = []
        for w in filter(lambda x: x is not None, where):
            if not maybe_expression(w):
                wlist.append(w)
            else:
                wlist.append(Term(w, scope_level=level))
        where = wlist
    elif maybe_expression(where):
        where = Term(where, scope_level=level)

    return where if where is None or len(where) else None
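
# Illustrative sketch (not part of the module): `_ensure_term` wraps raw query
# strings in `Term` objects and drops Nones, so downstream code only ever sees
# Terms or plain coordinate arrays.
#
# >>> _ensure_term("index > 5", scope_level=0)          # a single Term
# >>> _ensure_term(["index > 5", None], scope_level=0)  # -> [Term]; None dropped
# >>> _ensure_term([], scope_level=0)                   # empty list -> None
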
class PossibleDataLossError(Exception):
    pass


class ClosedFileError(Exception):
    pass


class IncompatibilityWarning(Warning):
    pass


incompatibility_doc = """
where criteria is being ignored as this version [%s] is too old (or
not-defined), read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""


class AttributeConflictWarning(Warning):
    pass


attribute_conflict_doc = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""


class DuplicateWarning(Warning):
    pass


duplicate_doc = """
duplicate entries in table, taking most recently appended
"""

performance_doc = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""

# formats
_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}

# axes map
_AXES_MAP = {DataFrame: [0]}

# register our configuration options
dropna_doc = """
: boolean
    drop ALL nan rows when appending to a table
"""
format_doc = """
: format
    default format for writing; if None, then
    put will default to 'fixed' and append will default to 'table'
"""

with config.config_prefix("io.hdf"):
    config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
    config.register_option(
        "default_format",
        None,
        format_doc,
        validator=config.is_one_of_factory(["fixed", "table", None]),
    )

# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False

def _tables():
    global _table_mod
    global _table_file_open_policy_is_strict
    if _table_mod is None:
        import tables

        _table_mod = tables

        # set the file open policy; this changed as of pytables 3.1,
        # depending on the HDF5 version
        try:
            _table_file_open_policy_is_strict = (
                tables.file._FILE_OPEN_POLICY == "strict"
            )
        except AttributeError:
            pass

    return _table_mod


# interface to/from ###

def to_hdf(
    path_or_buf,
    key: str,
    value: FrameOrSeries,
    mode: str = "a",
    complevel: Optional[int] = None,
    complib: Optional[str] = None,
    append: bool = False,
    format: Optional[str] = None,
    index: bool = True,
    min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
    nan_rep=None,
    dropna: Optional[bool] = None,
    data_columns: Optional[List[str]] = None,
    errors: str = "strict",
    encoding: str = "UTF-8",
):
    """ store this object, close it if we opened it """
    if append:
        f = lambda store: store.append(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            dropna=dropna,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
        )
    else:
        # NB: dropna is not passed to `put`
        f = lambda store: store.put(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
        )

    path_or_buf = stringify_path(path_or_buf)
    if isinstance(path_or_buf, str):
        with HDFStore(
            path_or_buf, mode=mode, complevel=complevel, complib=complib
        ) as store:
            f(store)
    else:
        f(path_or_buf)
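
# Illustrative sketch (not part of the module): `to_hdf` only chooses between
# `HDFStore.append` and `HDFStore.put` and manages the store's lifetime. The
# path and frame below are hypothetical.
#
# >>> to_hdf("store.h5", "df", df, mode="w")                       # put path
# >>> to_hdf("store.h5", "df2", df, append=True, format="table")   # append path
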
def read_hdf(
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start: Optional[int] = None,
    stop: Optional[int] = None,
    columns=None,
    iterator=False,
    chunksize: Optional[int] = None,
    **kwargs,
):
    """
    Read from the store, close it if we opened it.

    Retrieve pandas object stored in file, optionally based on where
    criteria.

    Parameters
    ----------
    path_or_buf : str, path object, pandas.HDFStore or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: ``file://localhost/path/to/table.h5``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handle (e.g. via builtin ``open`` function)
        or ``StringIO``.

        .. versionadded:: 0.21.0 support for __fspath__ protocol.

    key : object, optional
        The group identifier in the store. Can be omitted if the HDF file
        contains a single pandas object.
    mode : {'r', 'r+', 'a'}, default 'r'
        Mode to use when opening the file. Ignored if path_or_buf is a
        :class:`pandas.HDFStore`. Default is 'r'.
    where : list, optional
        A list of Term (or convertible) objects.
    start : int, optional
        Row number to start selection.
    stop : int, optional
        Row number to stop selection.
    columns : list, optional
        A list of column names to return.
    iterator : bool, optional
        Return an iterator object.
    chunksize : int, optional
        Number of rows to include in an iteration when using an iterator.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    **kwargs
        Additional keyword arguments passed to HDFStore.

    Returns
    -------
    item : object
        The selected object. Return type depends on the object stored.

    See Also
    --------
    DataFrame.to_hdf : Write a HDF file from a DataFrame.
    HDFStore : Low-level access to HDF files.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
    >>> df.to_hdf('./store.h5', 'data')
    >>> reread = pd.read_hdf('./store.h5')
    """
    if mode not in ["r", "r+", "a"]:
        raise ValueError(
            f"mode {mode} is not allowed while performing a read. "
            f"Allowed modes are r, r+ and a."
        )
    # grab the scope
    if where is not None:
        where = _ensure_term(where, scope_level=1)

    if isinstance(path_or_buf, HDFStore):
        if not path_or_buf.is_open:
            raise IOError("The HDFStore must be open for reading.")

        store = path_or_buf
        auto_close = False
    else:
        path_or_buf = stringify_path(path_or_buf)
        if not isinstance(path_or_buf, str):
            raise NotImplementedError(
                "Support for generic buffers has not been implemented."
            )
        try:
            exists = os.path.exists(path_or_buf)
        # if filepath is too long
        except (TypeError, ValueError):
            exists = False

        if not exists:
            raise FileNotFoundError(f"File {path_or_buf} does not exist")

        store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
        # can't auto open/close if we are using an iterator
        # so delegate to the iterator
        auto_close = True

    try:
        if key is None:
            groups = store.groups()
            if len(groups) == 0:
                raise ValueError("No dataset in HDF5 file.")
            candidate_only_group = groups[0]

            # For the HDF file to have only one dataset, all other groups
            # should then be metadata groups for that candidate group. (This
            # assumes that the groups() method enumerates parent groups
            # before their children.)
            for group_to_check in groups[1:]:
                if not _is_metadata_of(group_to_check, candidate_only_group):
                    raise ValueError(
                        "key must be provided when HDF5 file "
                        "contains multiple datasets."
                    )
            key = candidate_only_group._v_pathname
        return store.select(
            key,
            where=where,
            start=start,
            stop=stop,
            columns=columns,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )
    except (ValueError, TypeError, KeyError):
        if not isinstance(path_or_buf, HDFStore):
            # if there is an error, close the store if we opened it.
            try:
                store.close()
            except AttributeError:
                pass

        raise

def _is_metadata_of(group: "Node", parent_group: "Node") -> bool:
    """Check if a given group is a metadata group for a given parent_group."""
    if group._v_depth <= parent_group._v_depth:
        return False

    current = group
    while current._v_depth > 1:
        parent = current._v_parent
        if parent == parent_group and current._v_name == "meta":
            return True
        current = current._v_parent
    return False
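
# Illustrative layout (hypothetical paths): pandas stores auxiliary data, such
# as the categories of a categorical column, under a "meta" subgroup of the
# owning node. Walking up from such a group reaches a node named "meta" whose
# parent is the candidate dataset, so for
#
#   /df            <- the only real dataset
#   /df/meta/...   <- its metadata groups
#
# _is_metadata_of(store.get_node("df/meta"), store.get_node("df")) is True,
# which is how read_hdf decides the file really holds a single object.
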
class HDFStore:
    """
    Dict-like IO interface for storing pandas objects in PyTables.

    Either Fixed or Table format.

    Parameters
    ----------
    path : string
        File path to HDF5 file.
    mode : {'a', 'w', 'r', 'r+'}, default 'a'

        ``'r'``
            Read-only; no data can be modified.
        ``'w'``
            Write; a new file is created (an existing file with the same
            name would be deleted).
        ``'a'``
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        ``'r+'``
            It is similar to ``'a'``, but the file must already exist.
    complevel : int, 0-9, default None
        Specifies a compression level for data.
        A value of 0 or None disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available raises
        a ValueError.
    fletcher32 : bool, default False
        If applying compression use the fletcher32 checksum.

    Examples
    --------
    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5')
    >>> store['foo'] = bar  # write to HDF5
    >>> bar = store['foo']  # retrieve
    >>> store.close()
    """

    _handle: Optional["File"]
    _mode: str
    _complevel: int
    _fletcher32: bool

    def __init__(
        self,
        path,
        mode: str = "a",
        complevel: Optional[int] = None,
        complib=None,
        fletcher32: bool = False,
        **kwargs,
    ):
        if "format" in kwargs:
            raise ValueError("format is not a defined argument for HDFStore")

        tables = import_optional_dependency("tables")

        if complib is not None and complib not in tables.filters.all_complibs:
            raise ValueError(
                f"complib only supports {tables.filters.all_complibs} compression."
            )

        if complib is None and complevel is not None:
            complib = tables.filters.default_complib

        self._path = stringify_path(path)
        if mode is None:
            mode = "a"
        self._mode = mode
        self._handle = None
        self._complevel = complevel if complevel else 0
        self._complib = complib
        self._fletcher32 = fletcher32
        self._filters = None
        self.open(mode=mode, **kwargs)

    def __fspath__(self):
        return self._path

    @property
    def root(self):
        """ return the root node """
        self._check_if_open()
        return self._handle.root

    @property
    def filename(self):
        return self._path

    def __getitem__(self, key: str):
        return self.get(key)

    def __setitem__(self, key: str, value):
        self.put(key, value)

    def __delitem__(self, key: str):
        return self.remove(key)

    def __getattr__(self, name: str):
        """ allow attribute access to get stores """
        try:
            return self.get(name)
        except (KeyError, ClosedFileError):
            pass
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'"
        )

    def __contains__(self, key: str) -> bool:
        """
        check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
        """
        node = self.get_node(key)
        if node is not None:
            name = node._v_pathname
            if name == key or name[1:] == key:
                return True
        return False

    def __len__(self) -> int:
        return len(self.groups())

    def __repr__(self) -> str:
        pstr = pprint_thing(self._path)
        return f"{type(self)}\nFile path: {pstr}\n"

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def keys(self) -> List[str]:
        """
        Return a list of keys corresponding to objects stored in HDFStore.

        Returns
        -------
        list
            List of ABSOLUTE path-names (e.g. have the leading '/').
        """
        return [n._v_pathname for n in self.groups()]

    def __iter__(self):
        return iter(self.keys())

    def items(self):
        """
        iterate on key->group
        """
        for g in self.groups():
            yield g._v_pathname, g

    iteritems = items
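
    # Illustrative sketch (not part of the class): the dunder methods above
    # give HDFStore a dict-like surface. The key and frame are hypothetical.
    #
    # >>> store["df"] = df       # __setitem__ -> put
    # >>> "df" in store          # __contains__, leading '/' optional
    # >>> list(store)            # __iter__ over absolute pathnames
    # >>> del store["df"]        # __delitem__ -> remove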

    def open(self, mode: str = "a", **kwargs):
        """
        Open the file in the specified mode

        Parameters
        ----------
        mode : {'a', 'w', 'r', 'r+'}, default 'a'
            See HDFStore docstring or tables.open_file for info about modes
        """
        tables = _tables()

        if self._mode != mode:
            # if we are changing a write mode to read, ok
            if self._mode in ["a", "w"] and mode in ["r", "r+"]:
                pass
            elif mode in ["w"]:
                # this would truncate, raise here
                if self.is_open:
                    raise PossibleDataLossError(
                        f"Re-opening the file [{self._path}] with mode [{self._mode}] "
                        "will delete the current file!"
                    )

            self._mode = mode

        # close and reopen the handle
        if self.is_open:
            self.close()

        if self._complevel and self._complevel > 0:
            self._filters = _tables().Filters(
                self._complevel, self._complib, fletcher32=self._fletcher32
            )

        try:
            self._handle = tables.open_file(self._path, self._mode, **kwargs)
        except IOError as err:  # pragma: no cover
            if "can not be written" in str(err):
                print(f"Opening {self._path} in read-only mode")
                self._handle = tables.open_file(self._path, "r", **kwargs)
            else:
                raise
        except ValueError as err:
            # trap PyTables >= 3.1 FILE_OPEN_POLICY exception
            # to provide an updated message
            if "FILE_OPEN_POLICY" in str(err):
                hdf_version = tables.get_hdf5_version()
                err = ValueError(
                    f"PyTables [{tables.__version__}] no longer supports "
                    "opening multiple files\n"
                    "even in read-only mode on this HDF5 version "
                    f"[{hdf_version}]. You can accept this\n"
                    "and not open the same file multiple times at once,\n"
                    "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 "
                    "which allows\n"
                    "files to be opened multiple times at once\n"
                )
            raise err
        except Exception as err:
            # trying to read from a non-existent file causes an error which
            # is not part of IOError, make it one
            if self._mode == "r" and "Unable to open/create file" in str(err):
                raise IOError(str(err))
            raise

    def close(self):
        """
        Close the PyTables file handle
        """
        if self._handle is not None:
            self._handle.close()
        self._handle = None

    @property
    def is_open(self) -> bool:
        """
        return a boolean indicating whether the file is open
        """
        if self._handle is None:
            return False
        return bool(self._handle.isopen)

    def flush(self, fsync: bool = False):
        """
        Force all buffered modifications to be written to disk.

        Parameters
        ----------
        fsync : bool (default False)
            call ``os.fsync()`` on the file handle to force writing to disk.

        Notes
        -----
        Without ``fsync=True``, flushing may not guarantee that the OS writes
        to disk. With fsync, the operation will block until the OS claims the
        file has been written; however, other caching layers may still
        interfere.
        """
        if self._handle is not None:
            self._handle.flush()
            if fsync:
                try:
                    os.fsync(self._handle.fileno())
                except OSError:
                    pass
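
    # Illustrative sketch: flush pushes PyTables buffers to the OS; adding
    # fsync=True additionally blocks until the kernel acknowledges the write.
    #
    # >>> store.flush(fsync=True)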

    def get(self, key: str):
        """
        Retrieve pandas object stored in file.

        Parameters
        ----------
        key : str

        Returns
        -------
        object
            Same type as object stored in file.
        """
        group = self.get_node(key)
        if group is None:
            raise KeyError(f"No object named {key} in the file")
        return self._read_group(group)

    def select(
        self,
        key: str,
        where=None,
        start=None,
        stop=None,
        columns=None,
        iterator=False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas object stored in file, optionally based on where criteria.

        Parameters
        ----------
        key : str
            Object being retrieved from file.
        where : list, default None
            List of Term (or convertible) objects, optional.
        start : int, default None
            Row number to start selection.
        stop : int, default None
            Row number to stop selection.
        columns : list, default None
            A list of columns that if not None, will limit the return columns.
        iterator : bool, default False
            Returns an iterator.
        chunksize : int, default None
            Number of rows to include in iteration, return an iterator.
        auto_close : bool, default False
            Should automatically close the store when finished.

        Returns
        -------
        object
            Retrieved object from file.
        """
        group = self.get_node(key)
        if group is None:
            raise KeyError(f"No object named {key} in the file")

        # create the storer and axes
        where = _ensure_term(where, scope_level=1)
        s = self._create_storer(group)
        s.infer_axes()

        # function to call on iteration
        def func(_start, _stop, _where):
            return s.read(start=_start, stop=_stop, where=_where, columns=columns)

        # create the iterator
        it = TableIterator(
            self,
            s,
            func,
            where=where,
            nrows=s.nrows,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )

        return it.get_result()
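
    # Illustrative sketch (hypothetical key/columns): where criteria and
    # chunked reads only work on table-format nodes.
    #
    # >>> store.select("df", where="index >= 10", columns=["A"])
    # >>> for chunk in store.select("df", chunksize=50000):
    # ...     ...  # each chunk is a DataFrame slice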

    def select_as_coordinates(
        self,
        key: str,
        where=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):
        """
        return the selection as an Index

        Parameters
        ----------
        key : str
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        """
        where = _ensure_term(where, scope_level=1)
        tbl = self.get_storer(key)
        if not isinstance(tbl, Table):
            raise TypeError("can only read_coordinates with a table")
        return tbl.read_coordinates(where=where, start=start, stop=stop)

    def select_column(
        self,
        key: str,
        column: str,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):
        """
        return a single column from the table. This is generally only useful to
        select an indexable

        Parameters
        ----------
        key : str
        column : str
            The column of interest.
        start : int or None, default None
        stop : int or None, default None

        Raises
        ------
        raises KeyError if the column is not found (or key is not a valid
            store)
        raises ValueError if the column can not be extracted individually (it
            is part of a data block)
        """
        tbl = self.get_storer(key)
        if not isinstance(tbl, Table):
            raise TypeError("can only read_column with a table")
        return tbl.read_column(column=column, start=start, stop=stop)

    def select_as_multiple(
        self,
        keys,
        where=None,
        selector=None,
        columns=None,
        start=None,
        stop=None,
        iterator=False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas objects from multiple tables.

        Parameters
        ----------
        keys : a list of the tables
        selector : the table to apply the where criteria (defaults to keys[0]
            if not supplied)
        columns : the columns I want back
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        iterator : boolean, return an iterator, default False
        chunksize : nrows to include in iteration, return an iterator
        auto_close : bool, default False
            Should automatically close the store when finished.

        Raises
        ------
        raises KeyError if keys or selector is not found or keys is empty
        raises TypeError if keys is not a list or tuple
        raises ValueError if the tables are not ALL THE SAME DIMENSIONS
        """
        # default to single select
        where = _ensure_term(where, scope_level=1)
        if isinstance(keys, (list, tuple)) and len(keys) == 1:
            keys = keys[0]
        if isinstance(keys, str):
            return self.select(
                key=keys,
                where=where,
                columns=columns,
                start=start,
                stop=stop,
                iterator=iterator,
                chunksize=chunksize,
                auto_close=auto_close,
            )

        if not isinstance(keys, (list, tuple)):
            raise TypeError("keys must be a list/tuple")

        if not len(keys):
            raise ValueError("keys must have a non-zero length")

        if selector is None:
            selector = keys[0]

        # collect the tables
        tbls = [self.get_storer(k) for k in keys]
        s = self.get_storer(selector)

        # validate rows
        nrows = None
        for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
            if t is None:
                raise KeyError(f"Invalid table [{k}]")
            if not t.is_table:
                raise TypeError(
                    f"object [{t.pathname}] is not a table, and cannot be used in all "
                    "select as multiple"
                )

            if nrows is None:
                nrows = t.nrows
            elif t.nrows != nrows:
                raise ValueError("all tables must have exactly the same nrows!")

        # The isinstance checks here are redundant with the check above,
        # but necessary for mypy; see GH#29757
        _tbls = [x for x in tbls if isinstance(x, Table)]

        # axis is the concatenation axis
        axis = list({t.non_index_axes[0][0] for t in _tbls})[0]

        def func(_start, _stop, _where):
            # retrieve the objs, _where is always passed as a set of
            # coordinates here
            objs = [
                t.read(where=_where, columns=columns, start=_start, stop=_stop)
                for t in tbls
            ]

            # concat and return
            return concat(objs, axis=axis, verify_integrity=False)._consolidate()

        # create the iterator
        it = TableIterator(
            self,
            s,
            func,
            where=where,
            nrows=nrows,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )

        return it.get_result(coordinates=True)
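
    # Illustrative sketch (hypothetical keys): the where clause is resolved to
    # row coordinates on the selector table, then those rows are read from
    # every table and concatenated column-wise.
    #
    # >>> store.select_as_multiple(
    # ...     ["df1", "df2"], where="A > 0", selector="df1"
    # ... )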

    def put(
        self,
        key: str,
        value: FrameOrSeries,
        format=None,
        index=True,
        append=False,
        complib=None,
        complevel: Optional[int] = None,
        min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
        nan_rep=None,
        data_columns: Optional[List[str]] = None,
        encoding=None,
        errors: str = "strict",
    ):
        """
        Store object in HDFStore.

        Parameters
        ----------
        key : str
        value : {Series, DataFrame}
        format : 'fixed(f)|table(t)', default is 'fixed'
            fixed(f) : Fixed format
                Fast writing/reading. Not-appendable, nor searchable.
            table(t) : Table format
                Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching
                / selecting subsets of the data.
        append : bool, default False
            This will force Table format, append the input data to the
            existing.
        data_columns : list, default None
            List of columns to create as data columns, or True to
            use all columns. See `here
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
        encoding : str, default None
            Provide an encoding for strings.
        dropna : bool, default False
            Do not write an ALL nan row to the store, settable
            by the option 'io.hdf.dropna_table'.
        """
        if format is None:
            format = get_option("io.hdf.default_format") or "fixed"
        format = self._validate_format(format)
        self._write_to_group(
            key,
            value,
            format=format,
            index=index,
            append=append,
            complib=complib,
            complevel=complevel,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
            encoding=encoding,
            errors=errors,
        )
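
    # Illustrative sketch (hypothetical keys): 'fixed' trades queryability for
    # speed; 'table' supports where-based selection and later appends.
    #
    # >>> store.put("snapshot", df)  # fixed format, fast but not queryable
    # >>> store.put("log", df, format="table", data_columns=["user"])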

    def remove(self, key: str, where=None, start=None, stop=None):
        """
        Remove pandas object partially by specifying the where condition

        Parameters
        ----------
        key : string
            Node to remove or delete rows from
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection

        Returns
        -------
        number of rows removed (or None if not a Table)

        Raises
        ------
        raises KeyError if key is not a valid store
        """
        where = _ensure_term(where, scope_level=1)
        try:
            s = self.get_storer(key)
        except KeyError:
            # the key is not a valid store, re-raising KeyError
            raise
        except AssertionError:
            # surface any assertion errors for e.g. debugging
            raise
        except Exception:
            # In tests we get here with ClosedFileError, TypeError, and
            # _table_mod.NoSuchNodeError. TODO: Catch only these?

            if where is not None:
                raise ValueError(
                    "trying to remove a node with a non-None where clause!"
                )

            # we are actually trying to remove a node (with children)
            node = self.get_node(key)
            if node is not None:
                node._f_remove(recursive=True)
                return None

        # remove the node
        if com.all_none(where, start, stop):
            s.group._f_remove(recursive=True)

        # delete from the table
        else:
            if not s.is_table:
                raise ValueError(
                    "can only remove with where on objects written as tables"
                )
            return s.delete(where=where, start=start, stop=stop)
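
    # Illustrative sketch (hypothetical keys): without `where` the whole node
    # (and its children) is dropped; with `where` matching rows are deleted
    # from a table-format node and the removed-row count is returned.
    #
    # >>> store.remove("snapshot")                  # drop the node entirely
    # >>> store.remove("log", where="index < 10")   # delete matching rows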

    def append(
        self,
        key: str,
        value: FrameOrSeries,
        format=None,
        axes=None,
        index=True,
        append=True,
        complib=None,
        complevel: Optional[int] = None,
        columns=None,
        min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
        nan_rep=None,
        chunksize=None,
        expectedrows=None,
        dropna: Optional[bool] = None,
        data_columns: Optional[List[str]] = None,
        encoding=None,
        errors: str = "strict",
    ):
        """
        Append to Table in file. Node must already exist and be Table
        format.

        Parameters
        ----------
        key : str
        value : {Series, DataFrame}
        format : 'table' is the default
            table(t) : table format
                Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching
                / selecting subsets of the data.
        append : bool, default True
            Append the input data to the existing.
        data_columns : list of columns, or True, default None
            List of columns to create as indexed data columns for on-disk
            queries, or True to use all columns. By default only the axes
            of the object are indexed. See `here
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
        min_itemsize : dict of columns that specify minimum string sizes
        nan_rep : string to use as string nan representation
        chunksize : size to chunk the writing
        expectedrows : expected TOTAL row size of this table
        encoding : default None, provide an encoding for strings
        dropna : bool, default False
            Do not write an ALL nan row to the store, settable
            by the option 'io.hdf.dropna_table'.

        Notes
        -----
        Does *not* check if data being appended overlaps with existing
        data in the table, so be careful
        """
        if columns is not None:
            raise TypeError(
                "columns is not a supported keyword in append, try data_columns"
            )

        if dropna is None:
            dropna = get_option("io.hdf.dropna_table")
        if format is None:
            format = get_option("io.hdf.default_format") or "table"
        format = self._validate_format(format)
        self._write_to_group(
            key,
            value,
            format=format,
            axes=axes,
            index=index,
            append=append,
            complib=complib,
            complevel=complevel,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            chunksize=chunksize,
            expectedrows=expectedrows,
            dropna=dropna,
            data_columns=data_columns,
            encoding=encoding,
            errors=errors,
        )

    def append_to_multiple(
        self,
        d: Dict,
        value,
        selector,
        data_columns=None,
        axes=None,
        dropna=False,
        **kwargs,
    ):
        """
        Append to multiple tables

        Parameters
        ----------
        d : a dict of table_name to table_columns, None is acceptable as the
            values of one node (this will get all the remaining columns)
        value : a pandas object
        selector : a string that designates the indexable table; all of its
            columns will be designated as data_columns, unless data_columns is
            passed, in which case these are used
        data_columns : list of columns to create as data columns, or True to
            use all columns
        dropna : if evaluates to True, drop rows from all tables if any single
            row in each table has all NaN. Default False.

        Notes
        -----
        axes parameter is currently not accepted
        """
        if axes is not None:
            raise TypeError(
                "axes is currently not accepted as a parameter to append_to_multiple; "
                "you can create the tables independently instead"
            )

        if not isinstance(d, dict):
            raise ValueError(
                "append_to_multiple must have a dictionary specified as the "
                "way to split the value"
            )

        if selector not in d:
            raise ValueError(
                "append_to_multiple requires a selector that is in passed dict"
            )

        # figure out the splitting axis (the non_index_axis)
        axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]

        # figure out how to split the value
        remain_key = None
        remain_values: List = []
        for k, v in d.items():
            if v is None:
                if remain_key is not None:
                    raise ValueError(
                        "append_to_multiple can only have one value in d that "
                        "is None"
                    )
                remain_key = k
            else:
                remain_values.extend(v)
        if remain_key is not None:
            ordered = value.axes[axis]
            ordd = ordered.difference(Index(remain_values))
            ordd = sorted(ordered.get_indexer(ordd))
            d[remain_key] = ordered.take(ordd)

        # data_columns
        if data_columns is None:
            data_columns = d[selector]

        # ensure rows are synchronized across the tables
        if dropna:
            idxs = (value[cols].dropna(how="all").index for cols in d.values())
            valid_index = next(idxs)
            for index in idxs:
                valid_index = valid_index.intersection(index)
            value = value.loc[valid_index]

        # append
        for k, v in d.items():
            dc = data_columns if k == selector else None

            # compute the val
            val = value.reindex(v, axis=axis)

            self.append(k, val, data_columns=dc, **kwargs)
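
    # Illustrative sketch (hypothetical keys/columns): split one frame across
    # two tables, with "t1" acting as the indexable selector table.
    #
    # >>> store.append_to_multiple(
    # ...     {"t1": ["A", "B"], "t2": None},  # t2 gets the remaining columns
    # ...     df,
    # ...     selector="t1",
    # ... )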
    def create_table_index(
        self,
        key: str,
        columns=None,
        optlevel: Optional[int] = None,
        kind: Optional[str] = None,
    ):
        """
        Create a pytables index on the table.

        Parameters
        ----------
        key : str
        columns : None, bool, or listlike[str]
            Indicate which columns to create an index on.

            * False : Do not create any indexes.
            * True : Create indexes on all columns.
            * None : Create indexes on all columns.
            * listlike : Create indexes on the given columns.

        optlevel : int or None, default None
            Optimization level; if None, pytables defaults to 6.
        kind : str or None, default None
            Kind of index; if None, pytables defaults to "medium".

        Raises
        ------
        TypeError
            If the node is not a table.
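
        Examples
        --------
        A minimal sketch; ``store`` and ``df`` are assumed, and the key name
        is illustrative only:

        >>> store.append("frame", df, data_columns=["A"])  # doctest: +SKIP
        >>> store.create_table_index("frame", columns=["A"], kind="full")  # doctest: +SKIP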
  1109. """
  1110. # version requirements
  1111. _tables()
  1112. s = self.get_storer(key)
  1113. if s is None:
  1114. return
  1115. if not isinstance(s, Table):
  1116. raise TypeError("cannot create table index on a Fixed format store")
  1117. s.create_index(columns=columns, optlevel=optlevel, kind=kind)
  1118. def groups(self):
  1119. """
  1120. Return a list of all the top-level nodes.
  1121. Each node returned is not a pandas storage object.
  1122. Returns
  1123. -------
  1124. list
  1125. List of objects.
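
        Examples
        --------
        A minimal sketch; ``store`` is an assumed open HDFStore:

        >>> [g._v_pathname for g in store.groups()]  # doctest: +SKIP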
  1126. """
  1127. _tables()
  1128. self._check_if_open()
  1129. return [
  1130. g
  1131. for g in self._handle.walk_groups()
  1132. if (
  1133. not isinstance(g, _table_mod.link.Link)
  1134. and (
  1135. getattr(g._v_attrs, "pandas_type", None)
  1136. or getattr(g, "table", None)
  1137. or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
  1138. )
  1139. )
  1140. ]
  1141. def walk(self, where="/"):
  1142. """
  1143. Walk the pytables group hierarchy for pandas objects.
  1144. This generator will yield the group path, subgroups and pandas object
  1145. names for each group.
  1146. Any non-pandas PyTables objects that are not a group will be ignored.
  1147. The `where` group itself is listed first (preorder), then each of its
  1148. child groups (following an alphanumerical order) is also traversed,
  1149. following the same procedure.
  1150. .. versionadded:: 0.24.0
  1151. Parameters
  1152. ----------
  1153. where : str, default "/"
  1154. Group where to start walking.
  1155. Yields
  1156. ------
  1157. path : str
  1158. Full path to a group (without trailing '/').
  1159. groups : list
  1160. Names (strings) of the groups contained in `path`.
  1161. leaves : list
  1162. Names (strings) of the pandas objects contained in `path`.
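
        Examples
        --------
        A minimal sketch; ``store`` is an assumed open HDFStore:

        >>> for path, groups, leaves in store.walk():  # doctest: +SKIP
        ...     print(path, groups, leaves)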
  1163. """
  1164. _tables()
  1165. self._check_if_open()
  1166. for g in self._handle.walk_groups(where):
  1167. if getattr(g._v_attrs, "pandas_type", None) is not None:
  1168. continue
  1169. groups = []
  1170. leaves = []
  1171. for child in g._v_children.values():
  1172. pandas_type = getattr(child._v_attrs, "pandas_type", None)
  1173. if pandas_type is None:
  1174. if isinstance(child, _table_mod.group.Group):
  1175. groups.append(child._v_name)
  1176. else:
  1177. leaves.append(child._v_name)
  1178. yield (g._v_pathname.rstrip("/"), groups, leaves)
  1179. def get_node(self, key: str) -> Optional["Node"]:
  1180. """ return the node with the key or None if it does not exist """
  1181. self._check_if_open()
  1182. if not key.startswith("/"):
  1183. key = "/" + key
  1184. assert self._handle is not None
  1185. assert _table_mod is not None # for mypy
  1186. try:
  1187. node = self._handle.get_node(self.root, key)
  1188. except _table_mod.exceptions.NoSuchNodeError:
  1189. return None
  1190. assert isinstance(node, _table_mod.Node), type(node)
  1191. return node
  1192. def get_storer(self, key: str) -> Union["GenericFixed", "Table"]:
  1193. """ return the storer object for a key, raise if not in the file """
  1194. group = self.get_node(key)
  1195. if group is None:
  1196. raise KeyError(f"No object named {key} in the file")
  1197. s = self._create_storer(group)
  1198. s.infer_axes()
  1199. return s
  1200. def copy(
  1201. self,
  1202. file,
  1203. mode="w",
  1204. propindexes: bool = True,
  1205. keys=None,
  1206. complib=None,
  1207. complevel: Optional[int] = None,
  1208. fletcher32: bool = False,
  1209. overwrite=True,
  1210. ):
  1211. """
  1212. Copy the existing store to a new file, updating in place.
  1213. Parameters
  1214. ----------
  1215. propindexes: bool, default True
  1216. Restore indexes in copied file.
  1217. keys : list of keys to include in the copy (defaults to all)
  1218. overwrite : overwrite (remove and replace) existing nodes in the
  1219. new store (default is True)
  1220. mode, complib, complevel, fletcher32 same as in HDFStore.__init__
  1221. Returns
  1222. -------
  1223. open file handle of the new store
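
        Examples
        --------
        A minimal sketch; ``store`` is an assumed open HDFStore and the
        target path is illustrative only:

        >>> new_store = store.copy("backup.h5", propindexes=False)  # doctest: +SKIP
        >>> new_store.close()  # doctest: +SKIP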
  1224. """
  1225. new_store = HDFStore(
  1226. file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
  1227. )
  1228. if keys is None:
  1229. keys = list(self.keys())
  1230. if not isinstance(keys, (tuple, list)):
  1231. keys = [keys]
  1232. for k in keys:
  1233. s = self.get_storer(k)
  1234. if s is not None:
  1235. if k in new_store:
  1236. if overwrite:
  1237. new_store.remove(k)
  1238. data = self.select(k)
  1239. if isinstance(s, Table):
  1240. index: Union[bool, List[str]] = False
  1241. if propindexes:
  1242. index = [a.name for a in s.axes if a.is_indexed]
  1243. new_store.append(
  1244. k,
  1245. data,
  1246. index=index,
  1247. data_columns=getattr(s, "data_columns", None),
  1248. encoding=s.encoding,
  1249. )
  1250. else:
  1251. new_store.put(k, data, encoding=s.encoding)
  1252. return new_store
  1253. def info(self) -> str:
  1254. """
  1255. Print detailed information on the store.
  1256. .. versionadded:: 0.21.0
  1257. Returns
  1258. -------
  1259. str
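
        Examples
        --------
        A minimal sketch; ``store`` is an assumed open HDFStore:

        >>> print(store.info())  # doctest: +SKIP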
  1260. """
  1261. path = pprint_thing(self._path)
  1262. output = f"{type(self)}\nFile path: {path}\n"
  1263. if self.is_open:
  1264. lkeys = sorted(self.keys())
  1265. if len(lkeys):
  1266. keys = []
  1267. values = []
  1268. for k in lkeys:
  1269. try:
  1270. s = self.get_storer(k)
  1271. if s is not None:
  1272. keys.append(pprint_thing(s.pathname or k))
  1273. values.append(pprint_thing(s or "invalid_HDFStore node"))
  1274. except AssertionError:
  1275. # surface any assertion errors for e.g. debugging
  1276. raise
  1277. except Exception as detail:
  1278. keys.append(k)
  1279. dstr = pprint_thing(detail)
  1280. values.append(f"[invalid_HDFStore node: {dstr}]")
  1281. output += adjoin(12, keys, values)
  1282. else:
  1283. output += "Empty"
  1284. else:
  1285. output += "File is CLOSED"
  1286. return output
  1287. # ------------------------------------------------------------------------
  1288. # private methods
  1289. def _check_if_open(self):
  1290. if not self.is_open:
  1291. raise ClosedFileError(f"{self._path} file is not open!")
  1292. def _validate_format(self, format: str) -> str:
  1293. """ validate / deprecate formats """
  1294. # validate
  1295. try:
  1296. format = _FORMAT_MAP[format.lower()]
  1297. except KeyError:
  1298. raise TypeError(f"invalid HDFStore format specified [{format}]")
  1299. return format
  1300. def _create_storer(
  1301. self,
  1302. group,
  1303. format=None,
  1304. value: Optional[FrameOrSeries] = None,
  1305. encoding: str = "UTF-8",
  1306. errors: str = "strict",
  1307. ) -> Union["GenericFixed", "Table"]:
  1308. """ return a suitable class to operate """
  1309. cls: Union[Type["GenericFixed"], Type["Table"]]
  1310. if value is not None and not isinstance(value, (Series, DataFrame)):
  1311. raise TypeError("value must be None, Series, or DataFrame")
  1312. def error(t):
  1313. # return instead of raising so mypy can tell where we are raising
  1314. return TypeError(
  1315. f"cannot properly create the storer for: [{t}] [group->"
  1316. f"{group},value->{type(value)},format->{format}"
  1317. )
  1318. pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
  1319. tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))
  1320. # infer the pt from the passed value
  1321. if pt is None:
  1322. if value is None:
  1323. _tables()
  1324. assert _table_mod is not None # for mypy
  1325. if getattr(group, "table", None) or isinstance(
  1326. group, _table_mod.table.Table
  1327. ):
  1328. pt = "frame_table"
  1329. tt = "generic_table"
  1330. else:
  1331. raise TypeError(
  1332. "cannot create a storer if the object is not existing "
  1333. "nor a value are passed"
  1334. )
  1335. else:
  1336. _TYPE_MAP = {Series: "series", DataFrame: "frame"}
  1337. pt = _TYPE_MAP[type(value)]
  1338. # we are actually a table
  1339. if format == "table":
  1340. pt += "_table"
  1341. # a storer node
  1342. if "table" not in pt:
  1343. _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
  1344. try:
  1345. cls = _STORER_MAP[pt]
  1346. except KeyError:
  1347. raise error("_STORER_MAP")
  1348. return cls(self, group, encoding=encoding, errors=errors)
  1349. # existing node (and must be a table)
  1350. if tt is None:
  1351. # if we are a writer, determine the tt
  1352. if value is not None:
  1353. if pt == "series_table":
  1354. index = getattr(value, "index", None)
  1355. if index is not None:
  1356. if index.nlevels == 1:
  1357. tt = "appendable_series"
  1358. elif index.nlevels > 1:
  1359. tt = "appendable_multiseries"
  1360. elif pt == "frame_table":
  1361. index = getattr(value, "index", None)
  1362. if index is not None:
  1363. if index.nlevels == 1:
  1364. tt = "appendable_frame"
  1365. elif index.nlevels > 1:
  1366. tt = "appendable_multiframe"
  1367. _TABLE_MAP = {
  1368. "generic_table": GenericTable,
  1369. "appendable_series": AppendableSeriesTable,
  1370. "appendable_multiseries": AppendableMultiSeriesTable,
  1371. "appendable_frame": AppendableFrameTable,
  1372. "appendable_multiframe": AppendableMultiFrameTable,
  1373. "worm": WORMTable,
  1374. }
  1375. try:
  1376. cls = _TABLE_MAP[tt]
  1377. except KeyError:
  1378. raise error("_TABLE_MAP")
  1379. return cls(self, group, encoding=encoding, errors=errors)
  1380. def _write_to_group(
  1381. self,
  1382. key: str,
  1383. value: FrameOrSeries,
  1384. format,
  1385. axes=None,
  1386. index=True,
  1387. append=False,
  1388. complib=None,
  1389. complevel: Optional[int] = None,
  1390. fletcher32=None,
  1391. min_itemsize: Optional[Union[int, Dict[str, int]]] = None,
  1392. chunksize=None,
  1393. expectedrows=None,
  1394. dropna=False,
  1395. nan_rep=None,
  1396. data_columns=None,
  1397. encoding=None,
  1398. errors: str = "strict",
  1399. ):
  1400. group = self.get_node(key)
  1401. # we make this assertion for mypy; the get_node call will already
  1402. # have raised if this is incorrect
  1403. assert self._handle is not None
  1404. # remove the node if we are not appending
  1405. if group is not None and not append:
  1406. self._handle.remove_node(group, recursive=True)
  1407. group = None
  1408. # we don't want to store a table node at all if our object is 0-len
  1409. # as there are not dtypes
  1410. if getattr(value, "empty", None) and (format == "table" or append):
  1411. return
  1412. if group is None:
  1413. paths = key.split("/")
  1414. # recursively create the groups
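            # e.g. for key="/a/b/c" this walks down from "/", creating the
            # intermediate groups "a" and "b" (if missing) before "c"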
            path = "/"
            for p in paths:
                if not len(p):
                    continue
                new_path = path
                if not path.endswith("/"):
                    new_path += "/"
                new_path += p
                group = self.get_node(new_path)
                if group is None:
                    group = self._handle.create_group(path, p)
                path = new_path

        s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
        if append:
            # raise if we are trying to append to a Fixed format,
            # or a table that exists (and we are putting)
            if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
                raise ValueError("Can only append to Tables")
            if not s.is_exists:
                s.set_object_info()
        else:
            s.set_object_info()

        if not s.is_table and complib:
            raise ValueError("Compression not supported on Fixed format stores")

        # write the object
        s.write(
            obj=value,
            axes=axes,
            append=append,
            complib=complib,
            complevel=complevel,
            fletcher32=fletcher32,
            min_itemsize=min_itemsize,
            chunksize=chunksize,
            expectedrows=expectedrows,
            dropna=dropna,
            nan_rep=nan_rep,
            data_columns=data_columns,
        )

        if isinstance(s, Table) and index:
            s.create_index(columns=index)

    def _read_group(self, group: "Node"):
        s = self._create_storer(group)
        s.infer_axes()
        return s.read()


class TableIterator:
    """
    Define the iteration interface on a table

    Parameters
    ----------
    store : HDFStore
    s : the referred storer
    func : the function to execute the query
    where : the where of the query
    nrows : the rows to iterate on
    start : the passed start value (default is None)
    stop : the passed stop value (default is None)
    iterator : bool, default False
        Whether to use the default iterator.
    chunksize : the passed chunking value (default is 100000)
    auto_close : bool, default False
        Whether to automatically close the store at the end of iteration.
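
    Notes
    -----
    A hedged sketch of typical use (``store`` and the key name are
    assumptions): ``store.select("df", chunksize=50000)`` returns one of
    these iterators, which yields DataFrames of at most 50000 rows each.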
  1477. """
  1478. chunksize: Optional[int]
  1479. store: HDFStore
  1480. s: Union["GenericFixed", "Table"]
  1481. def __init__(
  1482. self,
  1483. store: HDFStore,
  1484. s: Union["GenericFixed", "Table"],
  1485. func,
  1486. where,
  1487. nrows,
  1488. start=None,
  1489. stop=None,
  1490. iterator: bool = False,
  1491. chunksize: Optional[int] = None,
  1492. auto_close: bool = False,
  1493. ):
  1494. self.store = store
  1495. self.s = s
  1496. self.func = func
  1497. self.where = where
  1498. # set start/stop if they are not set if we are a table
  1499. if self.s.is_table:
  1500. if nrows is None:
  1501. nrows = 0
  1502. if start is None:
  1503. start = 0
  1504. if stop is None:
  1505. stop = nrows
  1506. stop = min(nrows, stop)
  1507. self.nrows = nrows
  1508. self.start = start
  1509. self.stop = stop
  1510. self.coordinates = None
  1511. if iterator or chunksize is not None:
  1512. if chunksize is None:
  1513. chunksize = 100000
  1514. self.chunksize = int(chunksize)
  1515. else:
  1516. self.chunksize = None
  1517. self.auto_close = auto_close
  1518. def __iter__(self):
  1519. # iterate
  1520. current = self.start
  1521. while current < self.stop:
  1522. stop = min(current + self.chunksize, self.stop)
  1523. value = self.func(None, None, self.coordinates[current:stop])
  1524. current = stop
  1525. if value is None or not len(value):
  1526. continue
  1527. yield value
  1528. self.close()
  1529. def close(self):
  1530. if self.auto_close:
  1531. self.store.close()
  1532. def get_result(self, coordinates: bool = False):
  1533. # return the actual iterator
  1534. if self.chunksize is not None:
  1535. if not isinstance(self.s, Table):
  1536. raise TypeError("can only use an iterator or chunksize on a table")
  1537. self.coordinates = self.s.read_coordinates(where=self.where)
  1538. return self
  1539. # if specified read via coordinates (necessary for multiple selections
  1540. if coordinates:
  1541. if not isinstance(self.s, Table):
  1542. raise TypeError("can only read_coordinates on a table")
  1543. where = self.s.read_coordinates(
  1544. where=self.where, start=self.start, stop=self.stop
  1545. )
  1546. else:
  1547. where = self.where
  1548. # directly return the result
  1549. results = self.func(self.start, self.stop, where)
  1550. self.close()
  1551. return results

class IndexCol:
    """ an index column description class

    Parameters
    ----------
    axis : axis which I reference
    values : the ndarray like converted values
    kind : a string description of this type
    typ : the pytables type
    pos : the position in the pytables
    """

    is_an_indexable = True
    is_data_indexable = True
    _info_fields = ["freq", "tz", "index_name"]

    name: str
    cname: str

    def __init__(
        self,
        name: str,
        values=None,
        kind=None,
        typ=None,
        cname: Optional[str] = None,
        axis=None,
        pos=None,
        freq=None,
        tz=None,
        index_name=None,
        ordered=None,
        table=None,
        meta=None,
        metadata=None,
    ):
        if not isinstance(name, str):
            raise ValueError("`name` must be a str.")

        self.values = values
        self.kind = kind
        self.typ = typ
        self.name = name
        self.cname = cname or name
        self.axis = axis
        self.pos = pos
        self.freq = freq
        self.tz = tz
        self.index_name = index_name
        self.ordered = ordered
        self.table = table
        self.meta = meta
        self.metadata = metadata

        if pos is not None:
            self.set_pos(pos)

        # These are ensured as long as the passed arguments match the
        # constructor annotations.
        assert isinstance(self.name, str)
        assert isinstance(self.cname, str)

    @property
    def itemsize(self) -> int:
        # Assumes self.typ has already been initialized
        return self.typ.itemsize

    @property
    def kind_attr(self) -> str:
        return f"{self.name}_kind"

    def set_pos(self, pos: int):
        """ set the position of this column in the Table """
        self.pos = pos
        if pos is not None and self.typ is not None:
            self.typ._v_pos = pos

    def __repr__(self) -> str:
        temp = tuple(
            map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
        )
        return ",".join(
            (
                f"{key}->{value}"
                for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
            )
        )

    def __eq__(self, other: Any) -> bool:
        """ compare 2 col items """
        return all(
            getattr(self, a, None) == getattr(other, a, None)
            for a in ["name", "cname", "axis", "pos"]
        )

    def __ne__(self, other) -> bool:
        return not self.__eq__(other)

    @property
    def is_indexed(self) -> bool:
        """ return whether I am an indexed column """
        if not hasattr(self.table, "cols"):
            # e.g. if infer hasn't been called yet, self.table will be None.
            return False
        # GH#29692 mypy doesn't recognize self.table as having a "cols" attribute
        #  'error: "None" has no attribute "cols"'
        return getattr(self.table.cols, self.cname).is_indexed  # type: ignore

    def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
        """
        Convert the data from this selection to the appropriate pandas type.
        """
        assert isinstance(values, np.ndarray), type(values)

        # values is a recarray
        if values.dtype.fields is not None:
            values = values[self.cname]

        val_kind = _ensure_decoded(self.kind)
        values = _maybe_convert(values, val_kind, encoding, errors)

        kwargs = dict()
        kwargs["name"] = _ensure_decoded(self.index_name)

        if self.freq is not None:
            kwargs["freq"] = _ensure_decoded(self.freq)

        # making an Index instance could throw a number of different errors
        try:
            new_pd_index = Index(values, **kwargs)
        except ValueError:
            # if the output freq is different than what we recorded,
            # it should be None (see also 'doc example part 2')
            if "freq" in kwargs:
                kwargs["freq"] = None
            new_pd_index = Index(values, **kwargs)

        new_pd_index = _set_tz(new_pd_index, self.tz)
        return new_pd_index, new_pd_index

    def take_data(self):
        """ return the values """
        return self.values

    @property
    def attrs(self):
        return self.table._v_attrs

    @property
    def description(self):
        return self.table.description

    @property
    def col(self):
        """ return my current col description """
        return getattr(self.description, self.cname, None)

    @property
    def cvalues(self):
        """ return my cython values """
        return self.values

    def __iter__(self):
        return iter(self.values)

    def maybe_set_size(self, min_itemsize=None):
        """ maybe set a string col itemsize:
        min_itemsize can be an integer or a dict with this column's name
        with an integer size """
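        # e.g. min_itemsize=50 or min_itemsize={"index": 50} (hypothetical
        # values) both force at least a 50-byte StringCol for this column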
        if _ensure_decoded(self.kind) == "string":
            if isinstance(min_itemsize, dict):
                min_itemsize = min_itemsize.get(self.name)

            if min_itemsize is not None and self.typ.itemsize < min_itemsize:
                self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)

    def validate_names(self):
        pass

    def validate_and_set(self, handler: "AppendableTable", append: bool):
        self.table = handler.table
        self.validate_col()
        self.validate_attr(append)
        self.validate_metadata(handler)
        self.write_metadata(handler)
        self.set_attr()

    def validate_col(self, itemsize=None):
        """ validate this column: return the compared against itemsize """
        # validate this column for string truncation (or reset to the max size)
        if _ensure_decoded(self.kind) == "string":
            c = self.col
            if c is not None:
                if itemsize is None:
                    itemsize = self.itemsize
                if c.itemsize < itemsize:
                    raise ValueError(
                        f"Trying to store a string with len [{itemsize}] in "
                        f"[{self.cname}] column but\nthis column has a limit of "
                        f"[{c.itemsize}]!\nConsider using min_itemsize to "
                        "preset the sizes on these columns"
                    )
                return c.itemsize

        return None

    def validate_attr(self, append: bool):
        # check for backwards incompatibility
        if append:
            existing_kind = getattr(self.attrs, self.kind_attr, None)
            if existing_kind is not None and existing_kind != self.kind:
                raise TypeError(
                    f"incompatible kind in col [{existing_kind} - {self.kind}]"
                )

    def update_info(self, info):
        """ set/update the info for this indexable with the key/value;
        if there is a conflict, raise/warn as needed """
        for key in self._info_fields:

            value = getattr(self, key, None)
            idx = info.setdefault(self.name, {})

            existing_value = idx.get(key)
            if key in idx and value is not None and existing_value != value:

                # frequency/name just warn
                if key in ["freq", "index_name"]:
                    ws = attribute_conflict_doc % (key, existing_value, value)
                    warnings.warn(ws, AttributeConflictWarning, stacklevel=6)

                    # reset
                    idx[key] = None
                    setattr(self, key, None)

                else:
                    raise ValueError(
                        f"invalid info for [{self.name}] for [{key}], "
                        f"existing_value [{existing_value}] conflicts with "
                        f"new value [{value}]"
                    )
            else:
                if value is not None or existing_value is not None:
                    idx[key] = value

    def set_info(self, info):
        """ set my state from the passed info """
        idx = info.get(self.name)
        if idx is not None:
            self.__dict__.update(idx)

    def set_attr(self):
        """ set the kind for this column """
        setattr(self.attrs, self.kind_attr, self.kind)

    def validate_metadata(self, handler: "AppendableTable"):
        """ validate that kind=category does not change the categories """
        if self.meta == "category":
            new_metadata = self.metadata
            cur_metadata = handler.read_metadata(self.cname)
            if (
                new_metadata is not None
                and cur_metadata is not None
                and not array_equivalent(new_metadata, cur_metadata)
            ):
                raise ValueError(
                    "cannot append a categorical with "
                    "different categories to the existing"
                )

    def write_metadata(self, handler: "AppendableTable"):
        """ set the meta data """
        if self.metadata is not None:
            handler.write_metadata(self.cname, self.metadata)


class GenericIndexCol(IndexCol):
    """ an index which is not represented in the data of the table """

    @property
    def is_indexed(self) -> bool:
        return False

    def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
        """
        Convert the data from this selection to the appropriate pandas type.

        Parameters
        ----------
        values : np.ndarray
        nan_rep : str
        encoding : str
        errors : str
        """
        assert isinstance(values, np.ndarray), type(values)

        values = Int64Index(np.arange(len(values)))
        return values, values

    def set_attr(self):
        pass


class DataCol(IndexCol):
    """ a data holding column, by definition this is not indexable

    Parameters
    ----------
    data : the actual data
    cname : the column name in the table to hold the data (typically
        values)
    meta : a string description of the metadata
    metadata : the actual metadata
    """

    is_an_indexable = False
    is_data_indexable = False
    _info_fields = ["tz", "ordered"]

    def __init__(
        self,
        name: str,
        values=None,
        kind=None,
        typ=None,
        cname=None,
        pos=None,
        tz=None,
        ordered=None,
        table=None,
        meta=None,
        metadata=None,
        dtype=None,
        data=None,
    ):
        super().__init__(
            name=name,
            values=values,
            kind=kind,
            typ=typ,
            pos=pos,
            cname=cname,
            tz=tz,
            ordered=ordered,
            table=table,
            meta=meta,
            metadata=metadata,
        )
        self.dtype = dtype
        self.data = data

    @property
    def dtype_attr(self) -> str:
        return f"{self.name}_dtype"

    @property
    def meta_attr(self) -> str:
        return f"{self.name}_meta"

    def __repr__(self) -> str:
        temp = tuple(
            map(
                pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
            )
        )
        return ",".join(
            (
                f"{key}->{value}"
                for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
            )
        )

    def __eq__(self, other: Any) -> bool:
        """ compare 2 col items """
        return all(
            getattr(self, a, None) == getattr(other, a, None)
            for a in ["name", "cname", "dtype", "pos"]
        )

    def set_data(self, data: Union[np.ndarray, ABCExtensionArray]):
        assert data is not None
        assert self.dtype is None

        data, dtype_name = _get_data_and_dtype_name(data)

        self.data = data
        self.dtype = dtype_name
        self.kind = _dtype_to_kind(dtype_name)

    def take_data(self):
        """ return the data """
        return self.data

    @classmethod
    def _get_atom(cls, values: Union[np.ndarray, ABCExtensionArray]) -> "Col":
        """
        Get an appropriately typed and shaped pytables.Col object for values.
        """
        dtype = values.dtype
        itemsize = dtype.itemsize

        shape = values.shape
        if values.ndim == 1:
            # EA, use block shape pretending it is 2D
            shape = (1, values.size)

        if is_categorical_dtype(dtype):
            codes = values.codes
            atom = cls.get_atom_data(shape, kind=codes.dtype.name)
        elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
            atom = cls.get_atom_datetime64(shape)
        elif is_timedelta64_dtype(dtype):
            atom = cls.get_atom_timedelta64(shape)
        elif is_complex_dtype(dtype):
            atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
        elif is_string_dtype(dtype):
            atom = cls.get_atom_string(shape, itemsize)
        else:
            atom = cls.get_atom_data(shape, kind=dtype.name)

        return atom

    @classmethod
    def get_atom_string(cls, shape, itemsize):
        return _tables().StringCol(itemsize=itemsize, shape=shape[0])

    @classmethod
    def get_atom_coltype(cls, kind: str) -> Type["Col"]:
        """ return the PyTables column class for this column """
        if kind.startswith("uint"):
            k4 = kind[4:]
            col_name = f"UInt{k4}Col"
        elif kind.startswith("period"):
            # we store as integer
            col_name = "Int64Col"
        else:
            kcap = kind.capitalize()
            col_name = f"{kcap}Col"

        return getattr(_tables(), col_name)

    @classmethod
    def get_atom_data(cls, shape, kind: str) -> "Col":
        return cls.get_atom_coltype(kind=kind)(shape=shape[0])

    @classmethod
    def get_atom_datetime64(cls, shape):
        return _tables().Int64Col(shape=shape[0])

    @classmethod
    def get_atom_timedelta64(cls, shape):
        return _tables().Int64Col(shape=shape[0])

    @property
    def shape(self):
        return getattr(self.data, "shape", None)

    @property
    def cvalues(self):
        """ return my cython values """
        return self.data

    def validate_attr(self, append):
        """ validate that we have the same order as the existing & same dtype """
        if append:
            existing_fields = getattr(self.attrs, self.kind_attr, None)
            if existing_fields is not None and existing_fields != list(self.values):
                raise ValueError("appended items do not match existing items in table!")

            existing_dtype = getattr(self.attrs, self.dtype_attr, None)
            if existing_dtype is not None and existing_dtype != self.dtype:
                raise ValueError(
                    "appended items dtype do not match existing items dtype in table!"
                )

    def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
        """
        Convert the data from this selection to the appropriate pandas type.

        Parameters
        ----------
        values : np.ndarray
        nan_rep :
        encoding : str
        errors : str

        Returns
        -------
        index : listlike to become an Index
        data : ndarraylike to become a column
        """
        assert isinstance(values, np.ndarray), type(values)

        # values is a recarray
        if values.dtype.fields is not None:
            values = values[self.cname]

        assert self.typ is not None
        if self.dtype is None:
            # Note: in tests we never have timedelta64 or datetime64,
            #  so the _get_data_and_dtype_name may be unnecessary
            converted, dtype_name = _get_data_and_dtype_name(values)
            kind = _dtype_to_kind(dtype_name)
        else:
            converted = values
            dtype_name = self.dtype
            kind = self.kind

        assert isinstance(converted, np.ndarray)  # for mypy

        # use the meta if needed
        meta = _ensure_decoded(self.meta)
        metadata = self.metadata
        ordered = self.ordered
        tz = self.tz

        assert dtype_name is not None
        # convert to the correct dtype
        dtype = _ensure_decoded(dtype_name)

        # reverse converts
        if dtype == "datetime64":
            # recreate with tz if indicated
            converted = _set_tz(converted, tz, coerce=True)

        elif dtype == "timedelta64":
            converted = np.asarray(converted, dtype="m8[ns]")
        elif dtype == "date":
            try:
                converted = np.asarray(
                    [date.fromordinal(v) for v in converted], dtype=object
                )
            except ValueError:
                converted = np.asarray(
                    [date.fromtimestamp(v) for v in converted], dtype=object
                )

        elif meta == "category":
            # we have a categorical
            categories = metadata
            codes = converted.ravel()

            # if we have stored a NaN in the categories
            # then strip it; in theory we could have BOTH
            # -1s in the codes and nulls :<
            if categories is None:
                # Handle case of NaN-only categorical columns in which case
                # the categories are an empty array; when this is stored,
                # pytables cannot write a zero-len array, so on readback
                # the categories would be None and `read_hdf()` would fail.
                categories = Index([], dtype=np.float64)
            else:
                mask = isna(categories)
                if mask.any():
                    categories = categories[~mask]
                    codes[codes != -1] -= mask.astype(int).cumsum().values

            converted = Categorical.from_codes(
                codes, categories=categories, ordered=ordered
            )

        else:
            try:
                converted = converted.astype(dtype, copy=False)
            except TypeError:
                converted = converted.astype("O", copy=False)

        # convert nans / decode
        if _ensure_decoded(kind) == "string":
            converted = _unconvert_string_array(
                converted, nan_rep=nan_rep, encoding=encoding, errors=errors
            )

        return self.values, converted

    def set_attr(self):
        """ set the data for this column """
        setattr(self.attrs, self.kind_attr, self.values)
        setattr(self.attrs, self.meta_attr, self.meta)
        assert self.dtype is not None
        setattr(self.attrs, self.dtype_attr, self.dtype)


class DataIndexableCol(DataCol):
    """ represent a data column that can be indexed """

    is_data_indexable = True

    def validate_names(self):
        if not Index(self.values).is_object():
            # TODO: should the message here be more specifically non-str?
            raise ValueError("cannot have non-object label DataIndexableCol")

    @classmethod
    def get_atom_string(cls, shape, itemsize):
        return _tables().StringCol(itemsize=itemsize)

    @classmethod
    def get_atom_data(cls, shape, kind: str) -> "Col":
        return cls.get_atom_coltype(kind=kind)()

    @classmethod
    def get_atom_datetime64(cls, shape):
        return _tables().Int64Col()

    @classmethod
    def get_atom_timedelta64(cls, shape):
        return _tables().Int64Col()


class GenericDataIndexableCol(DataIndexableCol):
    """ represent a generic pytables data column """

    pass


class Fixed:
    """ represent an object in my store;
    facilitate read/write of various types of objects.
    This is an abstract base class.

    Parameters
    ----------
    parent : HDFStore
    group : Node
        The group node where the table resides.
    """

    pandas_kind: str
    format_type: str = "fixed"  # GH#30962 needed by dask
    obj_type: Type[Union[DataFrame, Series]]
    ndim: int
    encoding: str
    parent: HDFStore
    group: "Node"
    errors: str
    is_table = False

    def __init__(
        self,
        parent: HDFStore,
        group: "Node",
        encoding: str = "UTF-8",
        errors: str = "strict",
    ):
        assert isinstance(parent, HDFStore), type(parent)
        assert _table_mod is not None  # needed for mypy
        assert isinstance(group, _table_mod.Node), type(group)
        self.parent = parent
        self.group = group
        self.encoding = _ensure_encoding(encoding)
        self.errors = errors

    @property
    def is_old_version(self) -> bool:
        return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1

    @property
    def version(self) -> Tuple[int, int, int]:
        """ compute and set our version """
        version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
        try:
            version = tuple(int(x) for x in version.split("."))
            if len(version) == 2:
                version = version + (0,)
        except AttributeError:
            version = (0, 0, 0)
        return version

    @property
    def pandas_type(self):
        return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))

    def __repr__(self) -> str:
        """ return a pretty representation of myself """
        self.infer_axes()
        s = self.shape
        if s is not None:
            if isinstance(s, (list, tuple)):
                jshape = ",".join(pprint_thing(x) for x in s)
                s = f"[{jshape}]"
            return f"{self.pandas_type:12.12} (shape->{s})"
        return self.pandas_type

    def set_object_info(self):
        """ set my pandas type & version """
        self.attrs.pandas_type = str(self.pandas_kind)
        self.attrs.pandas_version = str(_version)

    def copy(self):
        new_self = copy.copy(self)
        return new_self

    @property
    def shape(self):
        return self.nrows

    @property
    def pathname(self):
        return self.group._v_pathname

    @property
    def _handle(self):
        return self.parent._handle

    @property
    def _filters(self):
        return self.parent._filters

    @property
    def _complevel(self) -> int:
        return self.parent._complevel

    @property
    def _fletcher32(self) -> bool:
        return self.parent._fletcher32

    @property
    def attrs(self):
        return self.group._v_attrs

    def set_attrs(self):
        """ set our object attributes """
        pass

    def get_attrs(self):
        """ get our object attributes """
        pass

    @property
    def storable(self):
        """ return my storable """
        return self.group

    @property
    def is_exists(self) -> bool:
        return False

    @property
    def nrows(self):
        return getattr(self.storable, "nrows", None)

    def validate(self, other):
        """ validate against an existing storable """
        if other is None:
            return
        return True

    def validate_version(self, where=None):
        """ are we trying to operate on an old version? """
        return True

    def infer_axes(self):
        """ infer the axes of my storer;
        return a boolean indicating if we have a valid storer or not """
        s = self.storable
        if s is None:
            return False
        self.get_attrs()
        return True

    def read(
        self,
        where=None,
        columns=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):
        raise NotImplementedError(
            "cannot read on an abstract storer: subclasses should implement"
        )

    def write(self, **kwargs):
        raise NotImplementedError(
            "cannot write on an abstract storer: subclasses should implement"
        )

    def delete(
        self, where=None, start: Optional[int] = None, stop: Optional[int] = None
    ):
        """
        support fully deleting the node in its entirety (only) - where
        specification must be None
        """
        if com.all_none(where, start, stop):
            self._handle.remove_node(self.group, recursive=True)
            return None

        raise TypeError("cannot delete on an abstract storer")

class GenericFixed(Fixed):
    """ a generified fixed version """

    _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
    _reverse_index_map = {v: k for k, v in _index_type_map.items()}
    attributes: List[str] = []

    # indexer helpers
    def _class_to_alias(self, cls) -> str:
        return self._index_type_map.get(cls, "")

    def _alias_to_class(self, alias):
        if isinstance(alias, type):  # pragma: no cover
            # compat: for a short period of time master stored types
            return alias
        return self._reverse_index_map.get(alias, Index)

    def _get_index_factory(self, klass):
        if klass == DatetimeIndex:

            def f(values, freq=None, tz=None):
                # data are already in UTC, localize and convert if tz present
                result = DatetimeIndex._simple_new(values.values, name=None, freq=freq)
                if tz is not None:
                    result = result.tz_localize("UTC").tz_convert(tz)
                return result

            return f
        elif klass == PeriodIndex:

            def f(values, freq=None, tz=None):
                return PeriodIndex._simple_new(values, name=None, freq=freq)

            return f

        return klass

    def validate_read(self, columns, where):
        """
        raise if any keywords are passed which are not-None
        """
        if columns is not None:
            raise TypeError(
                "cannot pass a column specification when reading "
                "a Fixed format store. this store must be "
                "selected in its entirety"
            )
        if where is not None:
            raise TypeError(
                "cannot pass a where specification when reading "
                "from a Fixed format store. this store must be "
                "selected in its entirety"
            )

    @property
    def is_exists(self) -> bool:
        return True

    def set_attrs(self):
        """ set our object attributes """
        self.attrs.encoding = self.encoding
        self.attrs.errors = self.errors

    def get_attrs(self):
        """ retrieve our attributes """
        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
        for n in self.attributes:
            setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))

    def write(self, obj, **kwargs):
        self.set_attrs()

    def read_array(
        self, key: str, start: Optional[int] = None, stop: Optional[int] = None
    ):
        """ read an array for the specified node (off of group) """
        import tables

        node = getattr(self.group, key)
        attrs = node._v_attrs

        transposed = getattr(attrs, "transposed", False)

        if isinstance(node, tables.VLArray):
            ret = node[0][start:stop]
        else:
            dtype = getattr(attrs, "value_type", None)
            shape = getattr(attrs, "shape", None)

            if shape is not None:
                # length 0 axis
                ret = np.empty(shape, dtype=dtype)
            else:
                ret = node[start:stop]

            if dtype == "datetime64":
                # reconstruct a timezone if indicated
                tz = getattr(attrs, "tz", None)
                ret = _set_tz(ret, tz, coerce=True)

            elif dtype == "timedelta64":
                ret = np.asarray(ret, dtype="m8[ns]")

        if transposed:
            return ret.T
        else:
            return ret

    def read_index(
        self, key: str, start: Optional[int] = None, stop: Optional[int] = None
    ) -> Index:
        variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))

        if variety == "multi":
            return self.read_multi_index(key, start=start, stop=stop)
        elif variety == "regular":
            node = getattr(self.group, key)
            index = self.read_index_node(node, start=start, stop=stop)
            return index
        else:  # pragma: no cover
            raise TypeError(f"unrecognized index variety: {variety}")

    def write_index(self, key: str, index: Index):
        if isinstance(index, MultiIndex):
            setattr(self.attrs, f"{key}_variety", "multi")
            self.write_multi_index(key, index)
        else:
            setattr(self.attrs, f"{key}_variety", "regular")
            converted = _convert_index("index", index, self.encoding, self.errors)

            self.write_array(key, converted.values)

            node = getattr(self.group, key)
            node._v_attrs.kind = converted.kind
            node._v_attrs.name = index.name

            if isinstance(index, (DatetimeIndex, PeriodIndex)):
                node._v_attrs.index_class = self._class_to_alias(type(index))

            if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
                node._v_attrs.freq = index.freq

            if isinstance(index, DatetimeIndex) and index.tz is not None:
                node._v_attrs.tz = _get_tz(index.tz)

    def write_multi_index(self, key: str, index: MultiIndex):
        setattr(self.attrs, f"{key}_nlevels", index.nlevels)

        for i, (lev, level_codes, name) in enumerate(
            zip(index.levels, index.codes, index.names)
        ):
            # write the level
            if is_extension_array_dtype(lev):
                raise NotImplementedError(
                    "Saving a MultiIndex with an extension dtype is not supported."
                )
            level_key = f"{key}_level{i}"
            conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
            self.write_array(level_key, conv_level.values)
            node = getattr(self.group, level_key)
            node._v_attrs.kind = conv_level.kind
            node._v_attrs.name = name

            # write the name
            setattr(node._v_attrs, f"{key}_name{name}", name)

            # write the labels
            label_key = f"{key}_label{i}"
            self.write_array(label_key, level_codes)

    def read_multi_index(
        self, key: str, start: Optional[int] = None, stop: Optional[int] = None
    ) -> MultiIndex:
        nlevels = getattr(self.attrs, f"{key}_nlevels")

        levels = []
        codes = []
        names: List[Optional[Hashable]] = []
        for i in range(nlevels):
            level_key = f"{key}_level{i}"
            node = getattr(self.group, level_key)
            lev = self.read_index_node(node, start=start, stop=stop)
            levels.append(lev)
            names.append(lev.name)

            label_key = f"{key}_label{i}"
            level_codes = self.read_array(label_key, start=start, stop=stop)
            codes.append(level_codes)

        return MultiIndex(
            levels=levels, codes=codes, names=names, verify_integrity=True
        )

    def read_index_node(
        self, node: "Node", start: Optional[int] = None, stop: Optional[int] = None
    ) -> Index:
        data = node[start:stop]
        # If the index was an empty array write_array_empty() will
        # have written a sentinel. Here we replace it with the original.
        if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
            data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
        kind = _ensure_decoded(node._v_attrs.kind)
        name = None

        if "name" in node._v_attrs:
            name = _ensure_str(node._v_attrs.name)
            name = _ensure_decoded(name)

        index_class = self._alias_to_class(
            _ensure_decoded(getattr(node._v_attrs, "index_class", ""))
        )
        factory = self._get_index_factory(index_class)

        kwargs = {}
        if "freq" in node._v_attrs:
            kwargs["freq"] = node._v_attrs["freq"]

        if "tz" in node._v_attrs:
            if isinstance(node._v_attrs["tz"], bytes):
                # created by python2
                kwargs["tz"] = node._v_attrs["tz"].decode("utf-8")
            else:
                # created by python3
                kwargs["tz"] = node._v_attrs["tz"]

        if kind == "date":
            index = factory(
                _unconvert_index(
                    data, kind, encoding=self.encoding, errors=self.errors
                ),
                dtype=object,
                **kwargs,
            )
        else:
            index = factory(
                _unconvert_index(
                    data, kind, encoding=self.encoding, errors=self.errors
                ),
                **kwargs,
            )

        index.name = name

        return index

    def write_array_empty(self, key: str, value: ArrayLike):
        """ write a 0-len array """
        # ugly hack for length 0 axes
        arr = np.empty((1,) * value.ndim)
        self._handle.create_array(self.group, key, arr)
        node = getattr(self.group, key)
        node._v_attrs.value_type = str(value.dtype)
        node._v_attrs.shape = value.shape

    def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None):
        # TODO: we only have one test that gets here, the only EA
        #  that gets passed is DatetimeArray, and we never have
        #  both self._filters and EA
        assert isinstance(value, (np.ndarray, ABCExtensionArray)), type(value)

        if key in self.group:
            self._handle.remove_node(self.group, key)

        # Transform needed to interface with pytables row/col notation
        empty_array = value.size == 0
        transposed = False

        if is_categorical_dtype(value):
            raise NotImplementedError(
                "Cannot store a category dtype in "
                "a HDF5 dataset that uses format="
                '"fixed". Use format="table".'
            )
        if not empty_array:
            if hasattr(value, "T"):
                # ExtensionArrays (1d) may not have transpose.
                value = value.T
                transposed = True

        atom = None
        if self._filters is not None:
            try:
                # get the atom for this datatype
                atom = _tables().Atom.from_dtype(value.dtype)
            except ValueError:
                pass

        if atom is not None:
            # We only get here if self._filters is non-None and
            #  the Atom.from_dtype call succeeded

            # create an empty chunked array and fill it from value
            if not empty_array:
                ca = self._handle.create_carray(
                    self.group, key, atom, value.shape, filters=self._filters
                )
                ca[:] = value
            else:
                self.write_array_empty(key, value)

        elif value.dtype.type == np.object_:
            # infer the type, warn if we have a non-string type here (for
            #  performance)
            inferred_type = lib.infer_dtype(value.ravel(), skipna=False)
            if empty_array:
                pass
            elif inferred_type == "string":
                pass
            else:
                ws = performance_doc % (inferred_type, key, items)
                warnings.warn(ws, PerformanceWarning, stacklevel=7)

            vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
            vlarr.append(value)

        elif empty_array:
            self.write_array_empty(key, value)

        elif is_datetime64_dtype(value.dtype):
            self._handle.create_array(self.group, key, value.view("i8"))
            getattr(self.group, key)._v_attrs.value_type = "datetime64"

        elif is_datetime64tz_dtype(value.dtype):
            # store as UTC
            # with a zone
            self._handle.create_array(self.group, key, value.asi8)

            node = getattr(self.group, key)
            node._v_attrs.tz = _get_tz(value.tz)
            node._v_attrs.value_type = "datetime64"

        elif is_timedelta64_dtype(value.dtype):
            self._handle.create_array(self.group, key, value.view("i8"))
            getattr(self.group, key)._v_attrs.value_type = "timedelta64"

        else:
            self._handle.create_array(self.group, key, value)

        getattr(self.group, key)._v_attrs.transposed = transposed

class SeriesFixed(GenericFixed):
    pandas_kind = "series"
    attributes = ["name"]

    name: Optional[Hashable]

    @property
    def shape(self):
        try:
            return (len(self.group.values),)
        except (TypeError, AttributeError):
            return None

    def read(
        self,
        where=None,
        columns=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):
        self.validate_read(columns, where)
        index = self.read_index("index", start=start, stop=stop)
        values = self.read_array("values", start=start, stop=stop)
        return Series(values, index=index, name=self.name)

    def write(self, obj, **kwargs):
        super().write(obj, **kwargs)
        self.write_index("index", obj.index)
        self.write_array("values", obj.values)
        self.attrs.name = obj.name


class BlockManagerFixed(GenericFixed):
    attributes = ["ndim", "nblocks"]

    nblocks: int

    @property
    def shape(self):
        try:
            ndim = self.ndim

            # items
            items = 0
            for i in range(self.nblocks):
                node = getattr(self.group, f"block{i}_items")
                shape = getattr(node, "shape", None)
                if shape is not None:
                    items += shape[0]

            # data shape
            node = self.group.block0_values
            shape = getattr(node, "shape", None)
            if shape is not None:
                shape = list(shape[0 : (ndim - 1)])
            else:
                shape = []

            shape.append(items)

            return shape
        except AttributeError:
            return None

    def read(
        self,
        where=None,
        columns=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):
        # start, stop applied to rows, so 0th axis only
        self.validate_read(columns, where)
        select_axis = self.obj_type()._get_block_manager_axis(0)

        axes = []
        for i in range(self.ndim):
            _start, _stop = (start, stop) if i == select_axis else (None, None)
            ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
            axes.append(ax)

        items = axes[0]
        dfs = []

        for i in range(self.nblocks):
            blk_items = self.read_index(f"block{i}_items")
            values = self.read_array(f"block{i}_values", start=_start, stop=_stop)

            columns = items[items.get_indexer(blk_items)]
            df = DataFrame(values.T, columns=columns, index=axes[1])
            dfs.append(df)

        if len(dfs) > 0:
            out = concat(dfs, axis=1)
            out = out.reindex(columns=items, copy=False)
            return out

        return DataFrame(columns=axes[0], index=axes[1])

    def write(self, obj, **kwargs):
        super().write(obj, **kwargs)
        data = obj._data
        if not data.is_consolidated():
            data = data.consolidate()

        self.attrs.ndim = data.ndim
        for i, ax in enumerate(data.axes):
            if i == 0:
                if not ax.is_unique:
                    raise ValueError("Columns index has to be unique for fixed format")
            self.write_index(f"axis{i}", ax)

        # Supporting mixed-type DataFrame objects...nontrivial
        self.attrs.nblocks = len(data.blocks)
        for i, blk in enumerate(data.blocks):
            # I have no idea why, but writing values before items fixed #2299
            blk_items = data.items.take(blk.mgr_locs)
            self.write_array(f"block{i}_values", blk.values, items=blk_items)
            self.write_index(f"block{i}_items", blk_items)


class FrameFixed(BlockManagerFixed):
    pandas_kind = "frame"
    obj_type = DataFrame
  2583. class Table(Fixed):
  2584. """ represent a table:
  2585. facilitate read/write of various types of tables
  2586. Attrs in Table Node
  2587. -------------------
  2588. These are attributes that are store in the main table node, they are
  2589. necessary to recreate these tables when read back in.
  2590. index_axes : a list of tuples of the (original indexing axis and
  2591. index column)
  2592. non_index_axes: a list of tuples of the (original index axis and
  2593. columns on a non-indexing axis)
  2594. values_axes : a list of the columns which comprise the data of this
  2595. table
  2596. data_columns : a list of the columns that we are allowing indexing
  2597. (these become single columns in values_axes), or True to force all
  2598. columns
  2599. nan_rep : the string to use for nan representations for string
  2600. objects
  2601. levels : the names of levels
  2602. metadata : the names of the metadata columns
  2603. """
    pandas_kind = "wide_table"
    format_type: str = "table"  # GH#30962 needed by dask
    table_type: str
    levels = 1
    is_table = True

    index_axes: List[IndexCol]
    non_index_axes: List[Tuple[int, Any]]
    values_axes: List[DataCol]
    data_columns: List
    metadata: List
    info: Dict

    def __init__(
        self,
        parent: HDFStore,
        group: "Node",
        encoding=None,
        errors: str = "strict",
        index_axes=None,
        non_index_axes=None,
        values_axes=None,
        data_columns=None,
        info=None,
        nan_rep=None,
    ):
        super().__init__(parent, group, encoding=encoding, errors=errors)
        self.index_axes = index_axes or []
        self.non_index_axes = non_index_axes or []
        self.values_axes = values_axes or []
        self.data_columns = data_columns or []
        self.info = info or dict()
        self.nan_rep = nan_rep

    @property
    def table_type_short(self) -> str:
        return self.table_type.split("_")[0]

    def __repr__(self) -> str:
        """ return a pretty representation of myself """
        self.infer_axes()
        jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
        dc = f",dc->[{jdc}]"

        ver = ""
        if self.is_old_version:
            jver = ".".join(str(x) for x in self.version)
            ver = f"[{jver}]"

        jindex_axes = ",".join(a.name for a in self.index_axes)
        return (
            f"{self.pandas_type:12.12}{ver} "
            f"(typ->{self.table_type_short},nrows->{self.nrows},"
            f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
        )

    def __getitem__(self, c: str):
        """ return the axis for c """
        for a in self.axes:
            if c == a.name:
                return a
        return None

    def validate(self, other):
        """ validate against an existing table """
        if other is None:
            return

        if other.table_type != self.table_type:
            raise TypeError(
                "incompatible table_type with existing "
                f"[{other.table_type} - {self.table_type}]"
            )

        for c in ["index_axes", "non_index_axes", "values_axes"]:
            sv = getattr(self, c, None)
            ov = getattr(other, c, None)
            if sv != ov:

                # show the error for the specific axes
                for i, sax in enumerate(sv):
                    oax = ov[i]
                    if sax != oax:
                        raise ValueError(
                            f"invalid combination of [{c}] on appending data "
                            f"[{sax}] vs current table [{oax}]"
                        )

                # should never get here
                raise Exception(
                    f"invalid combination of [{c}] on appending data [{sv}] vs "
                    f"current table [{ov}]"
                )

    @property
    def is_multi_index(self) -> bool:
        """the levels attribute is 1 or a list in the case of a multi-index"""
        return isinstance(self.levels, list)

    def validate_multiindex(self, obj):
        """validate that we can store the multi-index; reset and return the
        new object
        """
        levels = [
            l if l is not None else f"level_{i}" for i, l in enumerate(obj.index.names)
        ]
        try:
            return obj.reset_index(), levels
        except ValueError:
            raise ValueError(
                "duplicate names/columns in the multi-index when storing as a table"
            )
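
    # A minimal sketch of the reset performed above (hypothetical frame and
    # `table`, an instance of this class):
    #
    # >>> mi = MultiIndex.from_tuples([(0, "a"), (1, "b")], names=[None, "color"])
    # >>> df = DataFrame({"x": [1.0, 2.0]}, index=mi)
    # >>> reset, levels = table.validate_multiindex(df)
    # >>> levels
    # ['level_0', 'color']
    # >>> list(reset.columns)
    # ['level_0', 'color', 'x']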

    @property
    def nrows_expected(self) -> int:
        """ based on our axes, compute the expected nrows """
        return np.prod([i.cvalues.shape[0] for i in self.index_axes])

    @property
    def is_exists(self) -> bool:
        """ has this table been created """
        return "table" in self.group

    @property
    def storable(self):
        return getattr(self.group, "table", None)

    @property
    def table(self):
        """ return the table group (this is my storable) """
        return self.storable

    @property
    def dtype(self):
        return self.table.dtype

    @property
    def description(self):
        return self.table.description

    @property
    def axes(self):
        return itertools.chain(self.index_axes, self.values_axes)

    @property
    def ncols(self) -> int:
        """ the number of total columns in the values axes """
        return sum(len(a.values) for a in self.values_axes)

    @property
    def is_transposed(self) -> bool:
        return False

    @property
    def data_orientation(self):
        """return a tuple of my permuted axes, non_indexable at the front"""
        return tuple(
            itertools.chain(
                [int(a[0]) for a in self.non_index_axes],
                [int(a.axis) for a in self.index_axes],
            )
        )

    def queryables(self) -> Dict[str, Any]:
        """ return a dict of the kinds allowable columns for this object """
        # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
        axis_names = {0: "index", 1: "columns"}

        # compute the values_axes queryables
        d1 = [(a.cname, a) for a in self.index_axes]
        d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
        d3 = [
            (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
        ]

        # error: List comprehension has incompatible type
        #  List[Tuple[Any, None]]; expected List[Tuple[str, IndexCol]]
        return dict(d1 + d2 + d3)  # type: ignore

    def index_cols(self):
        """ return a list of my index cols """
        # Note: each `i.cname` below is assured to be a str.
        return [(i.axis, i.cname) for i in self.index_axes]

    def values_cols(self) -> List[str]:
        """ return a list of my values cols """
        return [i.cname for i in self.values_axes]

    def _get_metadata_path(self, key: str) -> str:
        """ return the metadata pathname for this key """
        group = self.group._v_pathname
        return f"{group}/meta/{key}/meta"
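
    # For example, for a table stored under the (illustrative) key "/df",
    # the metadata for a column named "color" lives at "/df/meta/color/meta".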

    def write_metadata(self, key: str, values: np.ndarray):
        """
        Write out a metadata array to the key as a Series in table format.

        Parameters
        ----------
        key : str
        values : ndarray
        """
        values = Series(values)
        self.parent.put(
            self._get_metadata_path(key),
            values,
            format="table",
            encoding=self.encoding,
            errors=self.errors,
            nan_rep=self.nan_rep,
        )

    def read_metadata(self, key: str):
        """ return the meta data array for this key """
        if getattr(getattr(self.group, "meta", None), key, None) is not None:
            return self.parent.select(self._get_metadata_path(key))
        return None

    def set_attrs(self):
        """ set our table type & indexables """
        self.attrs.table_type = str(self.table_type)
        self.attrs.index_cols = self.index_cols()
        self.attrs.values_cols = self.values_cols()
        self.attrs.non_index_axes = self.non_index_axes
        self.attrs.data_columns = self.data_columns
        self.attrs.nan_rep = self.nan_rep
        self.attrs.encoding = self.encoding
        self.attrs.errors = self.errors
        self.attrs.levels = self.levels
        self.attrs.info = self.info

    def get_attrs(self):
        """ retrieve our attributes """
        self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
        self.data_columns = getattr(self.attrs, "data_columns", None) or []
        self.info = getattr(self.attrs, "info", None) or dict()
        self.nan_rep = getattr(self.attrs, "nan_rep", None)
        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
        self.levels = getattr(self.attrs, "levels", None) or []
        self.index_axes = [a for a in self.indexables if a.is_an_indexable]
        self.values_axes = [a for a in self.indexables if not a.is_an_indexable]

    def validate_version(self, where=None):
        """ are we trying to operate on an old version? """
        if where is not None:
            if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1:
                ws = incompatibility_doc % ".".join([str(x) for x in self.version])
                warnings.warn(ws, IncompatibilityWarning)

    def validate_min_itemsize(self, min_itemsize):
        """validate that min_itemsize doesn't contain items that are not in the
        axes; this needs data_columns to be defined
        """
        if min_itemsize is None:
            return
        if not isinstance(min_itemsize, dict):
            return

        q = self.queryables()
        for k, v in min_itemsize.items():

            # ok, apply generally
            if k == "values":
                continue
            if k not in q:
                raise ValueError(
                    f"min_itemsize has the key [{k}] which is not an axis or "
                    "data_column"
                )

    @cache_readonly
    def indexables(self):
        """ create/cache the indexables if they don't exist """
        _indexables = []

        desc = self.description
        table_attrs = self.table.attrs

        # Note: each of the `name` kwargs below are str, ensured
        #  by the definition in index_cols.
        # index columns
        for i, (axis, name) in enumerate(self.attrs.index_cols):
            atom = getattr(desc, name)
            md = self.read_metadata(name)
            meta = "category" if md is not None else None

            kind_attr = f"{name}_kind"
            kind = getattr(table_attrs, kind_attr, None)

            index_col = IndexCol(
                name=name,
                axis=axis,
                pos=i,
                kind=kind,
                typ=atom,
                table=self.table,
                meta=meta,
                metadata=md,
            )
            _indexables.append(index_col)

        # values columns
        dc = set(self.data_columns)
        base_pos = len(_indexables)

        def f(i, c):
            assert isinstance(c, str)
            klass = DataCol
            if c in dc:
                klass = DataIndexableCol

            atom = getattr(desc, c)
            adj_name = _maybe_adjust_name(c, self.version)

            # TODO: why kind_attr here?
            values = getattr(table_attrs, f"{adj_name}_kind", None)
            dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
            kind = _dtype_to_kind(dtype)

            md = self.read_metadata(c)
            # TODO: figure out why these two versions of `meta` don't always match.
            #  meta = "category" if md is not None else None
            meta = getattr(table_attrs, f"{adj_name}_meta", None)

            obj = klass(
                name=adj_name,
                cname=c,
                values=values,
                kind=kind,
                pos=base_pos + i,
                typ=atom,
                table=self.table,
                meta=meta,
                metadata=md,
                dtype=dtype,
            )
            return obj

        # Note: the definition of `values_cols` ensures that each
        #  `c` below is a str.
        _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])

        return _indexables

    def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None):
        """
        Create a pytables index on the specified columns.

        Parameters
        ----------
        columns : None, bool, or listlike[str]
            Indicate which columns to create an index on.

            * False : Do not create any indexes.
            * True : Create indexes on all columns.
            * None : Create indexes on all columns.
            * listlike : Create indexes on the given columns.
        optlevel : int or None, default None
            Optimization level, if None, pytables defaults to 6.
        kind : str or None, default None
            Kind of index, if None, pytables defaults to "medium".

        Raises
        ------
        TypeError if trying to create an index on a complex-type column.

        Notes
        -----
        Cannot index Time64Col or ComplexCol.
        Pytables must be >= 3.0.
        """
        if not self.infer_axes():
            return
        if columns is False:
            return

        # index all indexables and data_columns
        if columns is None or columns is True:
            columns = [a.cname for a in self.axes if a.is_data_indexable]
        if not isinstance(columns, (tuple, list)):
            columns = [columns]

        kw = dict()
        if optlevel is not None:
            kw["optlevel"] = optlevel
        if kind is not None:
            kw["kind"] = kind

        table = self.table
        for c in columns:
            v = getattr(table.cols, c, None)
            if v is not None:

                # remove the index if the kind/optlevel have changed
                if v.is_indexed:
                    index = v.index
                    cur_optlevel = index.optlevel
                    cur_kind = index.kind

                    if kind is not None and cur_kind != kind:
                        v.remove_index()
                    else:
                        kw["kind"] = cur_kind

                    if optlevel is not None and cur_optlevel != optlevel:
                        v.remove_index()
                    else:
                        kw["optlevel"] = cur_optlevel

                # create the index
                if not v.is_indexed:
                    if v.type.startswith("complex"):
                        raise TypeError(
                            "Columns containing complex values can be stored but "
                            "cannot be indexed when using table format. Either use "
                            "fixed format, set index=False, or do not include "
                            "the columns containing complex values to "
                            "data_columns when initializing the table."
                        )
                    v.create_index(**kw)
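
    # A minimal usage sketch for the method above, via the public HDFStore API
    # (hypothetical path/key; assumes PyTables is available):
    #
    # >>> df = DataFrame({"A": range(5), "B": list("abcde")})
    # >>> with HDFStore("example.h5") as store:
    # ...     store.append("df", df, data_columns=["B"], index=False)
    # ...     store.create_table_index("df", columns=["B"], optlevel=9, kind="full")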

    def _read_axes(
        self, where, start: Optional[int] = None, stop: Optional[int] = None
    ) -> List[Tuple[ArrayLike, ArrayLike]]:
        """
        Create the axes sniffed from the table.

        Parameters
        ----------
        where : ???
        start : int or None, default None
        stop : int or None, default None

        Returns
        -------
        List[Tuple[index_values, column_values]]
        """
        # create the selection
        selection = Selection(self, where=where, start=start, stop=stop)
        values = selection.select()

        results = []
        # convert the data
        for a in self.axes:
            a.set_info(self.info)
            res = a.convert(
                values,
                nan_rep=self.nan_rep,
                encoding=self.encoding,
                errors=self.errors,
            )
            results.append(res)

        return results

    @classmethod
    def get_object(cls, obj, transposed: bool):
        """ return the data for this obj """
        return obj

    def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
        """take the input data_columns and min_itemsize and create a data
        columns spec
        """
        if not len(non_index_axes):
            return []

        axis, axis_labels = non_index_axes[0]
        info = self.info.get(axis, dict())
        if info.get("type") == "MultiIndex" and data_columns:
            raise ValueError(
                f"cannot use a multi-index on axis [{axis}] with "
                f"data_columns {data_columns}"
            )

        # evaluate the passed data_columns, True == use all columns
        # take only valid axis labels
        if data_columns is True:
            data_columns = list(axis_labels)
        elif data_columns is None:
            data_columns = []

        # if min_itemsize is a dict, add the keys (exclude 'values')
        if isinstance(min_itemsize, dict):
            existing_data_columns = set(data_columns)
            data_columns = list(data_columns)  # ensure we do not modify
            data_columns.extend(
                [
                    k
                    for k in min_itemsize.keys()
                    if k != "values" and k not in existing_data_columns
                ]
            )

        # return valid columns in the order of our axis
        return [c for c in data_columns if c in axis_labels]
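
    # A small worked example of the resolution above (hypothetical names):
    # with axis labels ["A", "B", "C"], data_columns=["B"] and
    # min_itemsize={"C": 20, "values": 50}, the key "C" is appended (while
    # "values" is skipped), giving ["B", "C"].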

    def _create_axes(
        self,
        axes,
        obj: DataFrame,
        validate: bool = True,
        nan_rep=None,
        data_columns=None,
        min_itemsize=None,
    ):
        """
        Create and return the axes.

        Parameters
        ----------
        axes: list or None
            The names or numbers of the axes to create.
        obj : DataFrame
            The object to create axes on.
        validate: bool, default True
            Whether to validate the obj against an existing object already written.
        nan_rep :
            A value to use for string column nan_rep.
        data_columns : List[str], True, or None, default None
            Specify the columns that we want to create to allow indexing on.

            * True : Use all available columns.
            * None : Use no columns.
            * List[str] : Use the specified columns.

        min_itemsize: Dict[str, int] or None, default None
            The min itemsize for a column in bytes.
        """
        if not isinstance(obj, DataFrame):
            group = self.group._v_name
            raise TypeError(
                f"cannot properly create the storer for: [group->{group},"
                f"value->{type(obj)}]"
            )

        # set the default axes if needed
        if axes is None:
            axes = [0]

        # map axes to numbers
        axes = [obj._get_axis_number(a) for a in axes]

        # do we have an existing table (if so, use its axes & data_columns)
        if self.infer_axes():
            table_exists = True
            axes = [a.axis for a in self.index_axes]
            data_columns = list(self.data_columns)
            nan_rep = self.nan_rep
            # TODO: do we always have validate=True here?
        else:
            table_exists = False

        new_info = self.info

        assert self.ndim == 2  # with next check, we must have len(axes) == 1
        # currently only support ndim-1 axes
        if len(axes) != self.ndim - 1:
            raise ValueError(
                "currently only support ndim-1 indexers in an AppendableTable"
            )

        # create according to the new data
        new_non_index_axes: List = []

        # nan_representation
        if nan_rep is None:
            nan_rep = "nan"

        # We construct the non-index-axis first, since that alters new_info
        idx = [x for x in [0, 1] if x not in axes][0]

        a = obj.axes[idx]
        # we might be able to change the axes on the appending data if necessary
        append_axis = list(a)
        if table_exists:
            indexer = len(new_non_index_axes)  # i.e. 0
            exist_axis = self.non_index_axes[indexer][1]
            if not array_equivalent(np.array(append_axis), np.array(exist_axis)):

                # ahah! -> reindex
                if array_equivalent(
                    np.array(sorted(append_axis)), np.array(sorted(exist_axis))
                ):
                    append_axis = exist_axis

        # the non_index_axes info
        info = new_info.setdefault(idx, {})
        info["names"] = list(a.names)
        info["type"] = type(a).__name__

        new_non_index_axes.append((idx, append_axis))

        # Now we can construct our new index axis
        idx = axes[0]
        a = obj.axes[idx]
        axis_name = obj._AXIS_NAMES[idx]
        new_index = _convert_index(axis_name, a, self.encoding, self.errors)
        new_index.axis = idx

        # Because we are always 2D, there is only one new_index, so
        #  we know it will have pos=0
        new_index.set_pos(0)
        new_index.update_info(new_info)
        new_index.maybe_set_size(min_itemsize)  # check for column conflicts

        new_index_axes = [new_index]
        j = len(new_index_axes)  # i.e. 1
        assert j == 1

        # reindex by our non_index_axes & compute data_columns
        assert len(new_non_index_axes) == 1
        for a in new_non_index_axes:
            obj = _reindex_axis(obj, a[0], a[1])

        def get_blk_items(mgr, blocks):
            return [mgr.items.take(blk.mgr_locs) for blk in blocks]

        transposed = new_index.axis == 1

        # figure out data_columns and get out blocks
        data_columns = self.validate_data_columns(
            data_columns, min_itemsize, new_non_index_axes
        )

        block_obj = self.get_object(obj, transposed)._consolidate()

        blocks, blk_items = self._get_blocks_and_items(
            block_obj, table_exists, new_non_index_axes, self.values_axes, data_columns
        )

        # add my values
        vaxes = []
        for i, (b, b_items) in enumerate(zip(blocks, blk_items)):

            # the shape of the data column is given by the indexable axes
            klass = DataCol
            name = None

            # we have a data_column
            if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
                klass = DataIndexableCol
                name = b_items[0]
                if not (name is None or isinstance(name, str)):
                    # TODO: should the message here be more specifically non-str?
                    raise ValueError("cannot have non-object label DataIndexableCol")

            # make sure that we match up the existing columns
            # if we have an existing table
            existing_col: Optional[DataCol]
            if table_exists and validate:
                try:
                    existing_col = self.values_axes[i]
                except (IndexError, KeyError):
                    raise ValueError(
                        f"Incompatible appended table [{blocks}]"
                        f"with existing table [{self.values_axes}]"
                    )
            else:
                existing_col = None

            new_name = name or f"values_block_{i}"
            data_converted = _maybe_convert_for_string_atom(
                new_name,
                b,
                existing_col=existing_col,
                min_itemsize=min_itemsize,
                nan_rep=nan_rep,
                encoding=self.encoding,
                errors=self.errors,
            )
            adj_name = _maybe_adjust_name(new_name, self.version)

            typ = klass._get_atom(data_converted)
            kind = _dtype_to_kind(data_converted.dtype.name)
            tz = _get_tz(data_converted.tz) if hasattr(data_converted, "tz") else None

            meta = metadata = ordered = None
            if is_categorical_dtype(data_converted):
                ordered = data_converted.ordered
                meta = "category"
                metadata = np.array(data_converted.categories, copy=False).ravel()

            data, dtype_name = _get_data_and_dtype_name(data_converted)

            col = klass(
                name=adj_name,
                cname=new_name,
                values=list(b_items),
                typ=typ,
                pos=j,
                kind=kind,
                tz=tz,
                ordered=ordered,
                meta=meta,
                metadata=metadata,
                dtype=dtype_name,
                data=data,
            )
            col.update_info(new_info)

            vaxes.append(col)

            j += 1

        dcs = [col.name for col in vaxes if col.is_data_indexable]

        new_table = type(self)(
            parent=self.parent,
            group=self.group,
            encoding=self.encoding,
            errors=self.errors,
            index_axes=new_index_axes,
            non_index_axes=new_non_index_axes,
            values_axes=vaxes,
            data_columns=dcs,
            info=new_info,
            nan_rep=nan_rep,
        )
        if hasattr(self, "levels"):
            # TODO: get this into constructor, only for appropriate subclass
            new_table.levels = self.levels

        new_table.validate_min_itemsize(min_itemsize)

        if validate and table_exists:
            new_table.validate(self)

        return new_table

    @staticmethod
    def _get_blocks_and_items(
        block_obj, table_exists, new_non_index_axes, values_axes, data_columns
    ):
        # Helper to clarify non-state-altering parts of _create_axes

        def get_blk_items(mgr, blocks):
            return [mgr.items.take(blk.mgr_locs) for blk in blocks]

        blocks = block_obj._data.blocks
        blk_items = get_blk_items(block_obj._data, blocks)

        if len(data_columns):
            axis, axis_labels = new_non_index_axes[0]
            new_labels = Index(axis_labels).difference(Index(data_columns))
            mgr = block_obj.reindex(new_labels, axis=axis)._data

            blocks = list(mgr.blocks)
            blk_items = get_blk_items(mgr, blocks)
            for c in data_columns:
                mgr = block_obj.reindex([c], axis=axis)._data
                blocks.extend(mgr.blocks)
                blk_items.extend(get_blk_items(mgr, mgr.blocks))

        # reorder the blocks in the same order as the existing table if we can
        if table_exists:
            by_items = {
                tuple(b_items.tolist()): (b, b_items)
                for b, b_items in zip(blocks, blk_items)
            }
            new_blocks = []
            new_blk_items = []
            for ea in values_axes:
                items = tuple(ea.values)
                try:
                    b, b_items = by_items.pop(items)
                    new_blocks.append(b)
                    new_blk_items.append(b_items)
                except (IndexError, KeyError):
                    jitems = ",".join(pprint_thing(item) for item in items)
                    raise ValueError(
                        f"cannot match existing table structure for [{jitems}] "
                        "on appending data"
                    )
            blocks = new_blocks
            blk_items = new_blk_items

        return blocks, blk_items

    def process_axes(self, obj, selection: "Selection", columns=None):
        """ process axes filters """
        # make a copy to avoid side effects
        if columns is not None:
            columns = list(columns)

        # make sure to include levels if we have them
        if columns is not None and self.is_multi_index:
            assert isinstance(self.levels, list)  # assured by is_multi_index
            for n in self.levels:
                if n not in columns:
                    columns.insert(0, n)

        # reorder by any non_index_axes & limit to the select columns
        for axis, labels in self.non_index_axes:
            obj = _reindex_axis(obj, axis, labels, columns)

        # apply the selection filters (but keep in the same order)
        if selection.filter is not None:
            for field, op, filt in selection.filter.format():

                def process_filter(field, filt):

                    for axis_name in obj._AXIS_NAMES.values():
                        axis_number = obj._get_axis_number(axis_name)
                        axis_values = obj._get_axis(axis_name)
                        assert axis_number is not None

                        # see if the field is the name of an axis
                        if field == axis_name:

                            # if we have a multi-index, then need to include
                            # the levels
                            if self.is_multi_index:
                                filt = filt.union(Index(self.levels))

                            takers = op(axis_values, filt)
                            return obj.loc(axis=axis_number)[takers]

                        # this might be the name of a field IN an axis
                        elif field in axis_values:

                            # we need to filter on this dimension
                            values = ensure_index(getattr(obj, field).values)
                            filt = ensure_index(filt)

                            # hack until we support reversed dim flags
                            if isinstance(obj, DataFrame):
                                axis_number = 1 - axis_number
                            takers = op(values, filt)
                            return obj.loc(axis=axis_number)[takers]

                    raise ValueError(f"cannot find the field [{field}] for filtering!")

                obj = process_filter(field, filt)

        return obj

    def create_description(
        self,
        complib,
        complevel: Optional[int],
        fletcher32: bool,
        expectedrows: Optional[int],
    ) -> Dict[str, Any]:
        """ create the description of the table from the axes & values """
        # provide expectedrows if it is passed
        if expectedrows is None:
            expectedrows = max(self.nrows_expected, 10000)

        d = dict(name="table", expectedrows=expectedrows)

        # description from the axes & values
        d["description"] = {a.cname: a.typ for a in self.axes}

        if complib:
            if complevel is None:
                complevel = self._complevel or 9
            filters = _tables().Filters(
                complevel=complevel,
                complib=complib,
                fletcher32=fletcher32 or self._fletcher32,
            )
            d["filters"] = filters
        elif self._filters is not None:
            d["filters"] = self._filters

        return d

    def read_coordinates(
        self, where=None, start: Optional[int] = None, stop: Optional[int] = None,
    ):
        """select coordinates (row numbers) from a table; return the
        coordinates object
        """
        # validate the version
        self.validate_version(where)

        # infer the data kind
        if not self.infer_axes():
            return False

        # create the selection
        selection = Selection(self, where=where, start=start, stop=stop)
        coords = selection.select_coords()
        if selection.filter is not None:
            for field, op, filt in selection.filter.format():
                data = self.read_column(
                    field, start=coords.min(), stop=coords.max() + 1
                )
                coords = coords[op(data.iloc[coords - coords.min()], filt).values]

        return Index(coords)
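
    # A minimal usage sketch via the public API (hypothetical key/column):
    #
    # >>> coords = store.select_as_coordinates("df", where="B == 'b'")
    # >>> store.select("df", where=coords)  # re-use the row numbers directly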

    def read_column(
        self,
        column: str,
        where=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):
        """return a single column from the table, generally only indexables
        are interesting
        """
        # validate the version
        self.validate_version()

        # infer the data kind
        if not self.infer_axes():
            return False

        if where is not None:
            raise TypeError("read_column does not currently accept a where clause")

        # find the axes
        for a in self.axes:
            if column == a.name:
                if not a.is_data_indexable:
                    raise ValueError(
                        f"column [{column}] can not be extracted individually; "
                        "it is not data indexable"
                    )

                # column must be an indexable or a data column
                c = getattr(self.table.cols, column)
                a.set_info(self.info)
                col_values = a.convert(
                    c[start:stop],
                    nan_rep=self.nan_rep,
                    encoding=self.encoding,
                    errors=self.errors,
                )
                return Series(_set_tz(col_values[1], a.tz), name=column)

        raise KeyError(f"column [{column}] not found in the table")
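
    # A minimal usage sketch via the public API (hypothetical key/column; the
    # column must be an indexable or a data_column):
    #
    # >>> store.select_column("df", "index")  # the stored index, as a Series
    # >>> store.select_column("df", "B")      # works if "B" is a data_column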


class WORMTable(Table):
    """ a write-once read-many table: this format DOES NOT ALLOW appending to a
    table. writing is a one-time operation; the data are stored in a format
    that allows for searching the data on disk
    """

    table_type = "worm"

    def read(
        self,
        where=None,
        columns=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):
        """ read the indices and the indexing array, calculate offset rows and
        return """
        raise NotImplementedError("WORMTable needs to implement read")

    def write(self, **kwargs):
        """ write in a format that we can search later on (but cannot append
        to): write out the indices and the values using _write_array
        (e.g. a CArray) create an indexing table so that we can search
        """
        raise NotImplementedError("WORMTable needs to implement write")


class AppendableTable(Table):
    """ support the new appendable table formats """

    table_type = "appendable"

    def write(
        self,
        obj,
        axes=None,
        append=False,
        complib=None,
        complevel=None,
        fletcher32=None,
        min_itemsize=None,
        chunksize=None,
        expectedrows=None,
        dropna=False,
        nan_rep=None,
        data_columns=None,
    ):
        if not append and self.is_exists:
            self._handle.remove_node(self.group, "table")

        # create the axes
        table = self._create_axes(
            axes=axes,
            obj=obj,
            validate=append,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
        )

        for a in table.axes:
            a.validate_names()

        if not table.is_exists:

            # create the table
            options = table.create_description(
                complib=complib,
                complevel=complevel,
                fletcher32=fletcher32,
                expectedrows=expectedrows,
            )

            # set the table attributes
            table.set_attrs()

            # create the table
            table._handle.create_table(table.group, **options)

        # update my info
        table.attrs.info = table.info

        # validate the axes and set the kinds
        for a in table.axes:
            a.validate_and_set(table, append)

        # add the rows
        table.write_data(chunksize, dropna=dropna)

    def write_data(self, chunksize: Optional[int], dropna: bool = False):
        """ we form the data into a 2-d including indexes, values, mask;
        write chunk-by-chunk """
        names = self.dtype.names
        nrows = self.nrows_expected

        # if dropna==True, then drop ALL nan rows
        masks = []
        if dropna:
            for a in self.values_axes:
                # figure the mask: only do if we can successfully process this
                # column, otherwise ignore the mask
                mask = isna(a.data).all(axis=0)
                if isinstance(mask, np.ndarray):
                    masks.append(mask.astype("u1", copy=False))

        # consolidate masks
        if len(masks):
            mask = masks[0]
            for m in masks[1:]:
                mask = mask & m
            mask = mask.ravel()
        else:
            mask = None

        # broadcast the indexes if needed
        indexes = [a.cvalues for a in self.index_axes]
        nindexes = len(indexes)
        assert nindexes == 1, nindexes  # ensures we don't need to broadcast

        # transpose the values so first dimension is last
        # reshape the values if needed
        values = [a.take_data() for a in self.values_axes]
        values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
        bvalues = []
        for i, v in enumerate(values):
            new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
            bvalues.append(values[i].reshape(new_shape))

        # write the chunks
        if chunksize is None:
            chunksize = 100000

        rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
        chunks = int(nrows / chunksize) + 1
        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            if start_i >= end_i:
                break

            self.write_data_chunk(
                rows,
                indexes=[a[start_i:end_i] for a in indexes],
                mask=mask[start_i:end_i] if mask is not None else None,
                values=[v[start_i:end_i] for v in bvalues],
            )
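
    # Chunking arithmetic for the loop above: with nrows=250000 and the
    # default chunksize=100000 we get chunks = 3 and row slices
    # [0, 100000), [100000, 200000), [200000, 250000); an exact multiple of
    # chunksize would produce one extra empty iteration, caught by the break.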

    def write_data_chunk(
        self,
        rows: np.ndarray,
        indexes: List[np.ndarray],
        mask: Optional[np.ndarray],
        values: List[np.ndarray],
    ):
        """
        Parameters
        ----------
        rows : an empty memory space where we are putting the chunk
        indexes : an array of the indexes
        mask : an array of the masks
        values : an array of the values
        """
        # 0 len
        for v in values:
            if not np.prod(v.shape):
                return

        nrows = indexes[0].shape[0]
        if nrows != len(rows):
            rows = np.empty(nrows, dtype=self.dtype)

        names = self.dtype.names
        nindexes = len(indexes)

        # indexes
        for i, idx in enumerate(indexes):
            rows[names[i]] = idx

        # values
        for i, v in enumerate(values):
            rows[names[i + nindexes]] = v

        # mask
        if mask is not None:
            m = ~mask.ravel().astype(bool, copy=False)
            if not m.all():
                rows = rows[m]

        if len(rows):
            self.table.append(rows)
            self.table.flush()

    def delete(
        self, where=None, start: Optional[int] = None, stop: Optional[int] = None,
    ):
        # delete all rows (and return the nrows)
        if where is None or not len(where):
            if start is None and stop is None:
                nrows = self.nrows
                self._handle.remove_node(self.group, recursive=True)
            else:
                # pytables<3.0 would remove a single row with stop=None
                if stop is None:
                    stop = self.nrows
                nrows = self.table.remove_rows(start=start, stop=stop)
                self.table.flush()
            return nrows

        # infer the data kind
        if not self.infer_axes():
            return None

        # create the selection
        table = self.table
        selection = Selection(self, where, start=start, stop=stop)
        values = selection.select_coords()

        # delete the rows in reverse order
        sorted_series = Series(values).sort_values()
        ln = len(sorted_series)

        if ln:

            # construct groups of consecutive rows
            diff = sorted_series.diff()
            groups = list(diff[diff > 1].index)

            # 1 group
            if not len(groups):
                groups = [0]

            # final element
            if groups[-1] != ln:
                groups.append(ln)

            # initial element
            if groups[0] != 0:
                groups.insert(0, 0)

            # we must remove in reverse order!
            pg = groups.pop()
            for g in reversed(groups):
                rows = sorted_series.take(range(g, pg))
                table.remove_rows(
                    start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
                )
                pg = g

            self.table.flush()

        # return the number of rows removed
        return ln
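
    # A worked example of the grouping above: selected rows [2, 3, 4, 8, 9]
    # have positional group boundaries [0, 3, 5], so remove_rows is called in
    # reverse order as (start=8, stop=10) and then (start=2, stop=5), keeping
    # earlier row numbers valid while later rows are being removed.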


class AppendableFrameTable(AppendableTable):
    """ support the new appendable table formats """

    pandas_kind = "frame_table"
    table_type = "appendable_frame"
    ndim = 2
    obj_type: Type[Union[DataFrame, Series]] = DataFrame

    @property
    def is_transposed(self) -> bool:
        return self.index_axes[0].axis == 1

    @classmethod
    def get_object(cls, obj, transposed: bool):
        """ these are written transposed """
        if transposed:
            obj = obj.T
        return obj

    def read(
        self,
        where=None,
        columns=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):
        # validate the version
        self.validate_version(where)

        # infer the data kind
        if not self.infer_axes():
            return None

        result = self._read_axes(where=where, start=start, stop=stop)

        info = (
            self.info.get(self.non_index_axes[0][0], dict())
            if len(self.non_index_axes)
            else dict()
        )

        inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
        assert len(inds) == 1
        ind = inds[0]

        index = result[ind][0]

        frames = []
        for i, a in enumerate(self.axes):
            if a not in self.values_axes:
                continue
            index_vals, cvalues = result[i]

            # we could have a multi-index constructor here
            # ensure_index doesn't recognize our list-of-tuples here
            if info.get("type") == "MultiIndex":
                cols = MultiIndex.from_tuples(index_vals)
            else:
                cols = Index(index_vals)

            names = info.get("names")
            if names is not None:
                cols.set_names(names, inplace=True)

            if self.is_transposed:
                values = cvalues
                index_ = cols
                cols_ = Index(index, name=getattr(index, "name", None))
            else:
                values = cvalues.T
                index_ = Index(index, name=getattr(index, "name", None))
                cols_ = cols

            # if we have a DataIndexableCol, its shape will only be 1 dim
            if values.ndim == 1 and isinstance(values, np.ndarray):
                values = values.reshape((1, values.shape[0]))

            if isinstance(values, np.ndarray):
                df = DataFrame(values.T, columns=cols_, index=index_)
            elif isinstance(values, Index):
                df = DataFrame(values, columns=cols_, index=index_)
            else:
                # Categorical
                df = DataFrame([values], columns=cols_, index=index_)
            assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
            frames.append(df)

        if len(frames) == 1:
            df = frames[0]
        else:
            df = concat(frames, axis=1)

        selection = Selection(self, where=where, start=start, stop=stop)
        # apply the selection filters & axis orderings
        df = self.process_axes(df, selection=selection, columns=columns)

        return df


class AppendableSeriesTable(AppendableFrameTable):
    """ support the new appendable table formats """

    pandas_kind = "series_table"
    table_type = "appendable_series"
    ndim = 2
    obj_type = Series

    @property
    def is_transposed(self) -> bool:
        return False

    @classmethod
    def get_object(cls, obj, transposed: bool):
        return obj

    def write(self, obj, data_columns=None, **kwargs):
        """ we are going to write this as a frame table """
        if not isinstance(obj, DataFrame):
            name = obj.name or "values"
            obj = obj.to_frame(name)
        return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)

    def read(
        self,
        where=None,
        columns=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ) -> Series:

        is_multi_index = self.is_multi_index
        if columns is not None and is_multi_index:
            assert isinstance(self.levels, list)  # needed for mypy
            for n in self.levels:
                if n not in columns:
                    columns.insert(0, n)
        s = super().read(where=where, columns=columns, start=start, stop=stop)
        if is_multi_index:
            s.set_index(self.levels, inplace=True)

        s = s.iloc[:, 0]

        # remove the default name
        if s.name == "values":
            s.name = None
        return s


class AppendableMultiSeriesTable(AppendableSeriesTable):
    """ support the new appendable table formats """

    pandas_kind = "series_table"
    table_type = "appendable_multiseries"

    def write(self, obj, **kwargs):
        """ we are going to write this as a frame table """
        name = obj.name or "values"
        obj, self.levels = self.validate_multiindex(obj)
        cols = list(self.levels)
        cols.append(name)
        obj.columns = cols
        return super().write(obj=obj, **kwargs)


class GenericTable(AppendableFrameTable):
    """ a table that read/writes the generic pytables table format """

    pandas_kind = "frame_table"
    table_type = "generic_table"
    ndim = 2
    obj_type = DataFrame

    @property
    def pandas_type(self) -> str:
        return self.pandas_kind

    @property
    def storable(self):
        return getattr(self.group, "table", None) or self.group

    def get_attrs(self):
        """ retrieve our attributes """
        self.non_index_axes = []
        self.nan_rep = None
        self.levels = []

        self.index_axes = [a for a in self.indexables if a.is_an_indexable]
        self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
        self.data_columns = [a.name for a in self.values_axes]

    @cache_readonly
    def indexables(self):
        """ create the indexables from the table description """
        d = self.description

        # TODO: can we get a typ for this?  AFAICT it is the only place
        #  where we aren't passing one
        # the index column is just a simple index
        md = self.read_metadata("index")
        meta = "category" if md is not None else None
        index_col = GenericIndexCol(
            name="index", axis=0, table=self.table, meta=meta, metadata=md
        )

        _indexables = [index_col]

        for i, n in enumerate(d._v_names):
            assert isinstance(n, str)

            atom = getattr(d, n)
            md = self.read_metadata(n)
            meta = "category" if md is not None else None
            dc = GenericDataIndexableCol(
                name=n,
                pos=i,
                values=[n],
                typ=atom,
                table=self.table,
                meta=meta,
                metadata=md,
            )
            _indexables.append(dc)

        return _indexables

    def write(self, **kwargs):
        raise NotImplementedError("cannot write on a generic table")


class AppendableMultiFrameTable(AppendableFrameTable):
    """ a frame with a multi-index """

    table_type = "appendable_multiframe"
    obj_type = DataFrame
    ndim = 2
    _re_levels = re.compile(r"^level_\d+$")

    @property
    def table_type_short(self) -> str:
        return "appendable_multi"

    def write(self, obj, data_columns=None, **kwargs):
        if data_columns is None:
            data_columns = []
        elif data_columns is True:
            data_columns = obj.columns.tolist()
        obj, self.levels = self.validate_multiindex(obj)
        for n in self.levels:
            if n not in data_columns:
                data_columns.insert(0, n)
        return super().write(obj=obj, data_columns=data_columns, **kwargs)

    def read(
        self,
        where=None,
        columns=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):

        df = super().read(where=where, columns=columns, start=start, stop=stop)
        df = df.set_index(self.levels)

        # remove names for 'level_%d'
        df.index = df.index.set_names(
            [None if self._re_levels.search(l) else l for l in df.index.names]
        )

        return df


def _reindex_axis(obj: DataFrame, axis: int, labels: Index, other=None) -> DataFrame:
    ax = obj._get_axis(axis)
    labels = ensure_index(labels)

    # try not to reindex even if other is provided
    # if it equals our current index
    if other is not None:
        other = ensure_index(other)
    if (other is None or labels.equals(other)) and labels.equals(ax):
        return obj

    labels = ensure_index(labels.unique())
    if other is not None:
        labels = ensure_index(other.unique()).intersection(labels, sort=False)
    if not labels.equals(ax):
        slicer: List[Union[slice, Index]] = [slice(None, None)] * obj.ndim
        slicer[axis] = labels
        obj = obj.loc[tuple(slicer)]
    return obj


# tz to/from coercion


def _get_tz(tz: tzinfo) -> Union[str, tzinfo]:
    """ for a tz-aware type, return an encoded zone """
    zone = timezones.get_timezone(tz)
    return zone


def _set_tz(
    values: Union[np.ndarray, Index],
    tz: Optional[Union[str, tzinfo]],
    coerce: bool = False,
) -> Union[np.ndarray, DatetimeIndex]:
    """
    coerce the values to a DatetimeIndex if tz is set
    preserve the input shape if possible

    Parameters
    ----------
    values : ndarray or Index
    tz : str or tzinfo
    coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
    """
    if isinstance(values, DatetimeIndex):
        # If values is tzaware, the tz gets dropped in the values.ravel()
        #  call below (which returns an ndarray).  So we are only non-lossy
        #  if `tz` matches `values.tz`.
        assert values.tz is None or values.tz == tz

    if tz is not None:
        name = getattr(values, "name", None)
        values = values.ravel()
        tz = timezones.get_timezone(_ensure_decoded(tz))
        values = DatetimeIndex(values, name=name)
        values = values.tz_localize("UTC").tz_convert(tz)
    elif coerce:
        values = np.asarray(values, dtype="M8[ns]")

    return values


def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
    assert isinstance(name, str)

    index_name = index.name
    converted, dtype_name = _get_data_and_dtype_name(index)
    kind = _dtype_to_kind(dtype_name)
    atom = DataIndexableCol._get_atom(converted)

    if isinstance(index, Int64Index):
        # Includes Int64Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
        #  in which case "kind" is "integer", "integer", "datetime64",
        #  "timedelta64", and "integer", respectively.
        return IndexCol(
            name,
            values=converted,
            kind=kind,
            typ=atom,
            freq=getattr(index, "freq", None),
            tz=getattr(index, "tz", None),
            index_name=index_name,
        )

    if isinstance(index, MultiIndex):
        raise TypeError("MultiIndex not supported here!")

    inferred_type = lib.infer_dtype(index, skipna=False)
    # we won't get inferred_type of "datetime64" or "timedelta64" as these
    #  would go through the DatetimeIndex/TimedeltaIndex paths above

    values = np.asarray(index)

    if inferred_type == "date":
        converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
        return IndexCol(
            name, converted, "date", _tables().Time32Col(), index_name=index_name,
        )
    elif inferred_type == "string":
        converted = _convert_string_array(values, encoding, errors)
        itemsize = converted.dtype.itemsize
        return IndexCol(
            name,
            converted,
            "string",
            _tables().StringCol(itemsize),
            index_name=index_name,
        )
    elif inferred_type in ["integer", "floating"]:
        return IndexCol(
            name, values=converted, kind=kind, typ=atom, index_name=index_name,
        )
    else:
        assert isinstance(converted, np.ndarray) and converted.dtype == object
        assert kind == "object", kind
        atom = _tables().ObjectAtom()
        return IndexCol(name, converted, kind, atom, index_name=index_name,)


def _unconvert_index(
    data, kind: str, encoding: str, errors: str
) -> Union[np.ndarray, Index]:
    index: Union[Index, np.ndarray]

    if kind == "datetime64":
        index = DatetimeIndex(data)
    elif kind == "timedelta64":
        index = TimedeltaIndex(data)
    elif kind == "date":
        try:
            index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
        except ValueError:
            index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
    elif kind in ("integer", "float"):
        index = np.asarray(data)
    elif kind == "string":
        index = _unconvert_string_array(
            data, nan_rep=None, encoding=encoding, errors=errors
        )
    elif kind == "object":
        index = np.asarray(data[0])
    else:  # pragma: no cover
        raise ValueError(f"unrecognized index type {kind}")
    return index


def _maybe_convert_for_string_atom(
    name: str, block, existing_col, min_itemsize, nan_rep, encoding, errors
):
    if not block.is_object:
        return block.values

    dtype_name = block.dtype.name
    inferred_type = lib.infer_dtype(block.values, skipna=False)

    if inferred_type == "date":
        raise TypeError("[date] is not implemented as a table column")
    elif inferred_type == "datetime":
        # after GH#8260
        # this only would be hit for a multi-timezone dtype which is an error
        raise TypeError(
            "too many timezones in this block, create separate data columns"
        )

    elif not (inferred_type == "string" or dtype_name == "object"):
        return block.values

    block = block.fillna(nan_rep, downcast=False)
    if isinstance(block, list):
        # Note: because block is always object dtype, fillna goes
        #  through a path such that the result is always a 1-element list
        block = block[0]
    data = block.values

    # see if we have a valid string type
    inferred_type = lib.infer_dtype(data.ravel(), skipna=False)
    if inferred_type != "string":

        # we cannot serialize this data, so report an exception on a column
        # by column basis
        for i in range(block.shape[0]):

            col = block.iget(i)
            inferred_type = lib.infer_dtype(col.ravel(), skipna=False)
            if inferred_type != "string":
                iloc = block.mgr_locs.indexer[i]
                raise TypeError(
                    f"Cannot serialize the column [{iloc}] because\n"
                    f"its data contents are [{inferred_type}] object dtype"
                )

    # itemsize is the maximum length of a string (along any dimension)
    data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
    assert data_converted.shape == block.shape, (data_converted.shape, block.shape)
    itemsize = data_converted.itemsize

    # specified min_itemsize?
    if isinstance(min_itemsize, dict):
        min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
    itemsize = max(min_itemsize or 0, itemsize)

    # check for conflicts with the existing column in values
    if existing_col is not None:
        eci = existing_col.validate_col(itemsize)
        if eci > itemsize:
            itemsize = eci

    data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
    return data_converted


def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
    """
    Take a string-like that is object dtype and coerce to a fixed size string type.

    Parameters
    ----------
    data : np.ndarray[object]
    encoding : str
    errors : str
        Handler for encoding errors.

    Returns
    -------
    np.ndarray[fixed-length-string]
    """
    # encode if needed
    if len(data):
        data = (
            Series(data.ravel()).str.encode(encoding, errors).values.reshape(data.shape)
        )

    # create the sized dtype
    ensured = ensure_object(data.ravel())
    itemsize = max(1, libwriters.max_len_string_array(ensured))

    data = np.asarray(data, dtype=f"S{itemsize}")
    return data
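
# For example (utf-8 encoding), an object array ["a", "bbb"] is encoded to
# bytes and sized to the longest element, coming back as dtype "S3":
#
# >>> _convert_string_array(np.array(["a", "bbb"], dtype=object), "UTF-8", "strict")
# array([b'a', b'bbb'], dtype='|S3')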


def _unconvert_string_array(
    data: np.ndarray, nan_rep, encoding: str, errors: str
) -> np.ndarray:
    """
    Inverse of _convert_string_array.

    Parameters
    ----------
    data : np.ndarray[fixed-length-string]
    nan_rep : the storage repr of NaN
    encoding : str
    errors : str
        Handler for encoding errors.

    Returns
    -------
    np.ndarray[object]
        Decoded data.
    """
    shape = data.shape
    data = np.asarray(data.ravel(), dtype=object)

    if len(data):
        itemsize = libwriters.max_len_string_array(ensure_object(data))
        dtype = f"U{itemsize}"

        if isinstance(data[0], bytes):
            data = Series(data).str.decode(encoding, errors=errors).values
        else:
            data = data.astype(dtype, copy=False).astype(object, copy=False)

    if nan_rep is None:
        nan_rep = "nan"

    data = libwriters.string_array_replace_from_nan_rep(data, nan_rep)
    return data.reshape(shape)


def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
    assert isinstance(val_kind, str), type(val_kind)
    if _need_convert(val_kind):
        conv = _get_converter(val_kind, encoding, errors)
        values = conv(values)
    return values


def _get_converter(kind: str, encoding: str, errors: str):
    if kind == "datetime64":
        return lambda x: np.asarray(x, dtype="M8[ns]")
    elif kind == "string":
        return lambda x: _unconvert_string_array(
            x, nan_rep=None, encoding=encoding, errors=errors
        )
    else:  # pragma: no cover
        raise ValueError(f"invalid kind {kind}")


def _need_convert(kind: str) -> bool:
    if kind in ("datetime64", "string"):
        return True
    return False


def _maybe_adjust_name(name: str, version) -> str:
    """
    Prior to 0.10.1, we named values blocks like values_block_0 under the
    name values_0; adjust the given name if necessary.

    Parameters
    ----------
    name : str
    version : Tuple[int, int, int]

    Returns
    -------
    str
    """
    try:
        if version[0] == 0 and version[1] <= 10 and version[2] == 0:
            m = re.search(r"values_block_(\d+)", name)
            if m:
                grp = m.groups()[0]
                name = f"values_{grp}"
    except IndexError:
        pass
    return name
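
# For example, for a file written by 0.10.0 the lookup name is rewritten:
#
# >>> _maybe_adjust_name("values_block_1", (0, 10, 0))
# 'values_1'
# >>> _maybe_adjust_name("values_block_1", (0, 10, 1))
# 'values_block_1'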


def _dtype_to_kind(dtype_str: str) -> str:
    """
    Find the "kind" string describing the given dtype name.
    """
    dtype_str = _ensure_decoded(dtype_str)

    if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
        kind = "string"
    elif dtype_str.startswith("float"):
        kind = "float"
    elif dtype_str.startswith("complex"):
        kind = "complex"
    elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
        kind = "integer"
    elif dtype_str.startswith("datetime64"):
        kind = "datetime64"
    elif dtype_str.startswith("timedelta"):
        kind = "timedelta64"
    elif dtype_str.startswith("bool"):
        kind = "bool"
    elif dtype_str.startswith("category"):
        kind = "category"
    elif dtype_str.startswith("period"):
        # We store the `freq` attr so we can restore from integers
        kind = "integer"
    elif dtype_str == "object":
        kind = "object"
    else:
        raise ValueError(f"cannot interpret dtype of [{dtype_str}]")

    return kind
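
# A few illustrative mappings:
#
# >>> _dtype_to_kind("datetime64[ns, UTC]")
# 'datetime64'
# >>> _dtype_to_kind("uint32")
# 'integer'
# >>> _dtype_to_kind("period[M]")  # stored as integers plus a `freq` attr
# 'integer'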


def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]):
    """
    Convert the passed data into a storable form and a dtype string.
    """
    if is_categorical_dtype(data.dtype):
        data = data.codes

    # For datetime64tz we need to drop the TZ in tests TODO: why?
    dtype_name = data.dtype.name.split("[")[0]

    if data.dtype.kind in ["m", "M"]:
        data = np.asarray(data.view("i8"))
        # TODO: we used to reshape for the dt64tz case, but no longer
        #  doing that doesn't seem to break anything.  why?

    elif isinstance(data, PeriodIndex):
        data = data.asi8

    data = np.asarray(data)
    return data, dtype_name


class Selection:
    """
    Carries out a selection operation on a tables.Table object.

    Parameters
    ----------
    table : a Table object
    where : list of Terms (or convertible to)
    start, stop: indices to start and/or stop selection
    """

    def __init__(
        self,
        table: Table,
        where=None,
        start: Optional[int] = None,
        stop: Optional[int] = None,
    ):
        self.table = table
        self.where = where
        self.start = start
        self.stop = stop
        self.condition = None
        self.filter = None
        self.terms = None
        self.coordinates = None

        if is_list_like(where):

            # see if we have a passed coordinate like
            try:
                inferred = lib.infer_dtype(where, skipna=False)
                if inferred == "integer" or inferred == "boolean":
                    where = np.asarray(where)
                    if where.dtype == np.bool_:
                        start, stop = self.start, self.stop
                        if start is None:
                            start = 0
                        if stop is None:
                            stop = self.table.nrows
                        self.coordinates = np.arange(start, stop)[where]
                    elif issubclass(where.dtype.type, np.integer):
                        if (self.start is not None and (where < self.start).any()) or (
                            self.stop is not None and (where >= self.stop).any()
                        ):
                            raise ValueError(
                                "where must have index locations >= start and < stop"
                            )
                        self.coordinates = where

            except ValueError:
                pass

        if self.coordinates is None:

            self.terms = self.generate(where)

            # create the numexpr & the filter
            if self.terms is not None:
                self.condition, self.filter = self.terms.evaluate()

    def generate(self, where):
        """ where can be a : dict,list,tuple,string """
        if where is None:
            return None

        q = self.table.queryables()
        try:
            return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
        except NameError:
            # raise a nice message, suggesting that the user should use
            # data_columns
            qkeys = ",".join(q.keys())
            raise ValueError(
                f"The passed where expression: {where}\n"
                "            contains an invalid variable reference\n"
                "            all of the variable references must be a "
                "reference to\n"
                "            an axis (e.g. 'index' or 'columns'), or a "
                "data_column\n"
                f"            The currently defined references are: {qkeys}\n"
            )

    def select(self):
        """
        generate the selection
        """
        if self.condition is not None:
            return self.table.table.read_where(
                self.condition.format(), start=self.start, stop=self.stop
            )
        elif self.coordinates is not None:
            return self.table.table.read_coordinates(self.coordinates)
        return self.table.table.read(start=self.start, stop=self.stop)

    def select_coords(self):
        """
        generate the selection
        """
        start, stop = self.start, self.stop
        nrows = self.table.nrows
        if start is None:
            start = 0
        elif start < 0:
            start += nrows
        if self.stop is None:
            stop = nrows
        elif stop < 0:
            stop += nrows

        if self.condition is not None:
            return self.table.table.get_where_list(
                self.condition.format(), start=start, stop=stop, sort=True
            )
        elif self.coordinates is not None:
            return self.coordinates
        return np.arange(start, stop)
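
# A minimal sketch of the three selection modes above (hypothetical `tbl`,
# an open Table storer):
#
# >>> Selection(tbl, where="index > 5").select()    # numexpr condition
# >>> Selection(tbl, where=[0, 2, 4]).select()      # explicit coordinates
# >>> Selection(tbl, start=0, stop=10).select()     # plain row range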