- """
- DataFrame
- ---------
- An efficient 2D container for potentially mixed-type time series or other
- labeled data series.
- Similar to its R counterpart, data.frame, except providing automatic data
- alignment and a host of useful data manipulation methods having to do with the
- labeling information
- """
import collections
from collections import abc
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
    IO,
    TYPE_CHECKING,
    Any,
    FrozenSet,
    Hashable,
    Iterable,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    Union,
    cast,
)
import warnings

import numpy as np
import numpy.ma as ma

from pandas._config import get_option

from pandas._libs import algos as libalgos, lib
from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer
from pandas.compat import PY37
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
    Appender,
    Substitution,
    deprecate_kwarg,
    rewrite_axis_style_signature,
)
from pandas.util._validators import (
    validate_axis_style_args,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    cast_scalar_to_array,
    coerce_to_dtypes,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_cast_to_datetime,
    maybe_convert_platform,
    maybe_downcast_to_dtype,
    maybe_infer_to_datetimelike,
    maybe_upcast,
    maybe_upcast_putmask,
)
from pandas.core.dtypes.common import (
    ensure_float64,
    ensure_int64,
    ensure_platform_int,
    infer_dtype_from_object,
    is_bool_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_named_tuple,
    is_object_dtype,
    is_scalar,
    is_sequence,
    needs_i8_conversion,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCIndexClass,
    ABCMultiIndex,
    ABCSeries,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms, common as com, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.groupby import generic as groupby_generic
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.multi import maybe_droplevels
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
    arrays_to_mgr,
    get_names_from_index,
    init_dict,
    init_ndarray,
    masked_rec_array_to_mgr,
    reorder_arrays,
    sanitize_index,
    to_arrays,
)
from pandas.core.ops.missing import dispatch_fill_zeros
from pandas.core.series import Series

from pandas.io.common import get_filepath_or_buffer
from pandas.io.formats import console, format as fmt
from pandas.io.formats.printing import pprint_thing

import pandas.plotting

if TYPE_CHECKING:
    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = dict(
    axes="index, columns",
    klass="DataFrame",
    axes_single_arg="{0 or 'index', 1 or 'columns'}",
    axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    optional_by="""
        by : str or list of str
            Name or list of names to sort by.

            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels.
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels.

            .. versionchanged:: 0.23.0

               Allow specifying index or column level names.""",
    versionadded_to_excel="",
    optional_labels="""labels : array-like, optional
        New labels / index to conform the axis specified by 'axis' to.""",
    optional_axis="""axis : int or str, optional
        Axis to target. Can be either the axis name ('index', 'columns')
        or number (0, 1).""",
)

_numeric_only_doc = """numeric_only : boolean, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data.
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : tuple of (str, str), default ('_x', '_y')
    Suffix to apply to overlapping column names in the left and right
    side, respectively. To raise an exception on overlapping columns use
    (False, False).
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. If a string, a column with
    information on the source of each row will be added to the output
    DataFrame, and the column will be named the value of the string.
    The information column is Categorical-type and takes on a value of
    "left_only" for observations whose merge key only appears in the 'left'
    DataFrame, "right_only" for observations whose merge key only appears
    in the 'right' DataFrame, and "both" if the observation's merge key is
    found in both.
validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

    .. versionadded:: 0.21.0

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0.
Support for merging named Series objects was added in version 0.24.0.

Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
   lkey  value
0   foo      1
1   bar      2
2   baz      3
3   foo      5
>>> df2
   rkey  value
0   foo      5
1   bar      6
2   baz      7
3   foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
   lkey  value_x  rkey  value_y
0   foo        1   foo        5
1   foo        1   foo        8
2   foo        5   foo        5
3   foo        5   foo        8
4   bar        2   bar        6
5   baz        3   baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
   lkey  value_left  rkey  value_right
0   foo           1   foo            5
1   foo           1   foo            8
2   foo           5   foo            5
3   foo           5   foo            8
4   bar           2   bar            6
5   baz           3   baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')
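
The ``indicator`` flag described above can be checked without reproducing a
full frame repr; a small sketch (the "_merge" column is Categorical, and here
every key occurs on both sides):

>>> merged = df1.merge(df2, left_on='lkey', right_on='rkey',
...                    indicator=True)
>>> (merged['_merge'] == 'both').all()
True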
- """

# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, or list-like objects.

        .. versionchanged:: 0.23.0
           If data is a dict, column order follows insertion-order for
           Python 3.6 and later.

        .. versionchanged:: 0.25.0
           If data is a list of dicts, column order follows insertion-order
           for Python 3.6 and later.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information part of input data and no index provided.
    columns : Index or array-like
        Column labels to use for resulting frame. Will default to
        RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool, default False
        Copy data from inputs. Only affects DataFrame / 2d ndarray input.

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv
    read_table
    read_clipboard

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9
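
    Constructing DataFrame from a list of dicts (column order follows
    insertion order, per the versionchanged note above); a short sketch:

    >>> pd.DataFrame([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])
       a  b
    0  1  2
    1  3  4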
- """
- _typ = "dataframe"
- @property
- def _constructor(self) -> Type["DataFrame"]:
- return DataFrame
- _constructor_sliced: Type[Series] = Series
- _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([])
- _accessors: Set[str] = {"sparse"}
- @property
- def _constructor_expanddim(self):
- raise NotImplementedError("Not supported for DataFrames!")

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data=None,
        index: Optional[Axes] = None,
        columns: Optional[Axes] = None,
        dtype: Optional[Dtype] = None,
        copy: bool = False,
    ):
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._data

        if isinstance(data, BlockManager):
            mgr = self._init_mgr(
                data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
            )
        elif isinstance(data, dict):
            mgr = init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)

            # a masked array
            else:
                mask = ma.getmaskarray(data)
                if mask.any():
                    data, fill_value = maybe_upcast(data, copy=True)
                    data.soften_mask()  # set hardmask False if it was True
                    data[mask] = fill_value
                else:
                    data = data.copy()
                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

        elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                # structured ndarray: treat each named field as a column
                data_columns = list(data.dtype.names)
                data = {k: data[k] for k in data_columns}
                if columns is None:
                    columns = data_columns
                mgr = init_dict(data, index, columns, dtype=dtype)
            elif getattr(data, "name", None) is not None:
                # named Series/Index: use its name as the single column label
                mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
            else:
                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

        # For data is list-like, or Iterable (will consume into list)
        elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
            if not isinstance(data, (abc.Sequence, ExtensionArray)):
                data = list(data)
            if len(data) > 0:
                if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
                    arrays, columns = to_arrays(data, columns, dtype=dtype)
                    columns = ensure_index(columns)

                    # set the index
                    if index is None:
                        if isinstance(data[0], Series):
                            index = get_names_from_index(data)
                        elif isinstance(data[0], Categorical):
                            index = ibase.default_index(len(data[0]))
                        else:
                            index = ibase.default_index(len(data))

                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
                else:
                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
            else:
                mgr = init_dict({}, index, columns, dtype=dtype)
        else:
            try:
                arr = np.array(data, dtype=dtype, copy=copy)
            except (ValueError, TypeError) as e:
                exc = TypeError(
                    "DataFrame constructor called with "
                    f"incompatible data and dtype: {e}"
                )
                raise exc from e

            if arr.ndim == 0 and index is not None and columns is not None:
                # scalar input: broadcast to the full (index, columns) shape
                values = cast_scalar_to_array(
                    (len(index), len(columns)), data, dtype=dtype
                )
                mgr = init_ndarray(
                    values, index, columns, dtype=values.dtype, copy=False
                )
            else:
                raise ValueError("DataFrame constructor not properly called!")

        NDFrame.__init__(self, mgr, fastpath=True)

    # ----------------------------------------------------------------------

    @property
    def axes(self) -> List[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self) -> Tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    @property
    def _is_homogeneous_type(self) -> bool:
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        See Also
        --------
        Index._is_homogeneous_type : Whether the object has a single
            dtype.
        MultiIndex._is_homogeneous_type : Whether all the levels of a
            MultiIndex have the same dtype.

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...     "A": np.array([1, 2], dtype=np.int32),
        ...     "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if self._data.any_extension_types:
            return len({block.dtype for block in self._data.blocks}) == 1
        else:
            return not self._data.is_mixed_type

    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self) -> bool:
        """
        Check length against max_rows.
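
        Examples
        --------
        A minimal illustration of the check (private helper, shown here
        only as a sketch):

        >>> with pd.option_context("display.max_rows", 1):
        ...     pd.DataFrame({"a": [1, 2]})._repr_fits_vertical_()
        False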
- """
- max_rows = get_option("display.max_rows")
- return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if the full repr fits in the horizontal boundaries imposed by
        the display options width and max_columns.

        In case of a non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
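
        Examples
        --------
        A small sketch of the column-count check (private helper):

        >>> with pd.option_context("display.max_columns", 1):
        ...     pd.DataFrame({"a": [1], "b": [2]})._repr_fits_horizontal_()
        False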
- """
- width, height = console.get_console_size()
- max_columns = get_option("display.max_columns")
- nb_columns = len(self.columns)
- # exceed max columns
- if (max_columns and nb_columns > max_columns) or (
- (not ignore_width) and width and nb_columns > (width // 2)
- ):
- return False
- # used by repr_html under IPython notebook or scripts ignore terminal
- # dims
- if ignore_width or not console.in_interactive_session():
- return True
- if get_option("display.width") is not None or console.in_ipython_frontend():
- # check at least the column row for excessive width
- max_rows = 1
- else:
- max_rows = get_option("display.max_rows")
- # when auto-detecting, so width=None and not in ipython front end
- # check whether repr fits horizontal by actually checking
- # the width of the rendered repr
- buf = StringIO()
- # only care about the stuff we'll actually print out
- # and to_string on entire frame may be expensive
- d = self
- if not (max_rows is None): # unlimited rows
- # min of two, where one may be None
- d = d.iloc[: min(max_rows, len(d))]
- else:
- return True
- d.to_string(buf=buf)
- value = buf.getvalue()
- repr_width = max(len(l) for l in value.split("\n"))
- return repr_width < width

    def _info_repr(self) -> bool:
        """
        True if the repr should show the info view.
        """
        info_repr_option = get_option("display.large_repr") == "info"
        return info_repr_option and not (
            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
        )

    def __repr__(self) -> str:
        """
        Return a string representation for a particular DataFrame.
        """
        buf = StringIO("")
        if self._info_repr():
            self.info(buf=buf)
            return buf.getvalue()

        max_rows = get_option("display.max_rows")
        min_rows = get_option("display.min_rows")
        max_cols = get_option("display.max_columns")
        max_colwidth = get_option("display.max_colwidth")
        show_dimensions = get_option("display.show_dimensions")
        if get_option("display.expand_frame_repr"):
            width, _ = console.get_console_size()
        else:
            width = None
        self.to_string(
            buf=buf,
            max_rows=max_rows,
            min_rows=min_rows,
            max_cols=max_cols,
            line_width=width,
            max_colwidth=max_colwidth,
            show_dimensions=show_dimensions,
        )

        return buf.getvalue()

    def _repr_html_(self) -> Optional[str]:
        """
        Return a html representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        if self._info_repr():
            buf = StringIO("")
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
            return "<pre>" + val + "</pre>"
        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            min_rows = get_option("display.min_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            formatter = fmt.DataFrameFormatter(
                self,
                columns=None,
                col_space=None,
                na_rep="NaN",
                formatters=None,
                float_format=None,
                sparsify=None,
                justify=None,
                index_names=True,
                header=True,
                index=True,
                bold_rows=True,
                escape=True,
                max_rows=max_rows,
                min_rows=min_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=".",
                table_id=None,
                render_links=False,
            )
            return formatter.to_html(notebook=True)
        else:
            return None

    @Substitution(
        header_type="bool or sequence",
        header="Write out the column names. If a list of strings "
        "is given, it is assumed to be aliases for the "
        "column names",
        col_space_type="int",
        col_space="The minimum width of each column",
    )
    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
    def to_string(
        self,
        buf: Optional[FilePathOrBuffer[str]] = None,
        columns: Optional[Sequence[str]] = None,
        col_space: Optional[int] = None,
        header: Union[bool, Sequence[str]] = True,
        index: bool = True,
        na_rep: str = "NaN",
        formatters: Optional[fmt.formatters_type] = None,
        float_format: Optional[fmt.float_format_type] = None,
        sparsify: Optional[bool] = None,
        index_names: bool = True,
        justify: Optional[str] = None,
        max_rows: Optional[int] = None,
        min_rows: Optional[int] = None,
        max_cols: Optional[int] = None,
        show_dimensions: bool = False,
        decimal: str = ".",
        line_width: Optional[int] = None,
        max_colwidth: Optional[int] = None,
        encoding: Optional[str] = None,
    ) -> Optional[str]:
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        max_colwidth : int, optional
            Max width to truncate each column in characters. By default, no limit.

            .. versionadded:: 1.0.0
        encoding : str, default "utf-8"
            Set character encoding.

            .. versionadded:: 1.0
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
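
        The wrapping and truncation parameters documented above can be
        exercised the same way; a hedged sketch that only checks the return
        type, since the exact wrapped layout depends on the data:

        >>> s = df.to_string(line_width=10)
        >>> isinstance(s, str)
        True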
- """
- from pandas import option_context
- with option_context("display.max_colwidth", max_colwidth):
- formatter = fmt.DataFrameFormatter(
- self,
- columns=columns,
- col_space=col_space,
- na_rep=na_rep,
- formatters=formatters,
- float_format=float_format,
- sparsify=sparsify,
- justify=justify,
- index_names=index_names,
- header=header,
- index=index,
- min_rows=min_rows,
- max_rows=max_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- decimal=decimal,
- line_width=line_width,
- )
- return formatter.to_string(buf=buf, encoding=encoding)

    # ----------------------------------------------------------------------

    @property
    def style(self) -> "Styler":
        """
        Returns a Styler object.

        Contains methods for building a styled HTML representation
        of the DataFrame.

        See Also
        --------
        io.formats.style.Styler
        """
        from pandas.io.formats.style import Styler

        return Styler(self)

    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                    'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
             species  population
    panda       bear        1864
    polar       bear       22000
    koala  marsupial       80000
    >>> for label, content in df.items():
    ...     print('label:', label)
    ...     print('content:', content, sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """
- @Appender(_shared_docs["items"])
- def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
- if self.columns.is_unique and hasattr(self, "_item_cache"):
- for k in self.columns:
- yield k, self._get_item_cache(k)
- else:
- for i, k in enumerate(self.columns):
- yield k, self._ixs(i, axis=1)
- @Appender(_shared_docs["items"])
- def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
- yield from self.items()
- def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
- """
- Iterate over DataFrame rows as (index, Series) pairs.
- Yields
- ------
- index : label or tuple of label
- The index of the row. A tuple for a `MultiIndex`.
- data : Series
- The data of the row as a Series.
- See Also
- --------
- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
- DataFrame.items : Iterate over (column name, Series) pairs.
- Notes
- -----
- 1. Because ``iterrows`` returns a Series for each row,
- it does **not** preserve dtypes across the rows (dtypes are
- preserved across columns for DataFrames). For example,
- >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
- >>> row = next(df.iterrows())[1]
- >>> row
- int 1.0
- float 1.5
- Name: 0, dtype: float64
- >>> print(row['int'].dtype)
- float64
- >>> print(df['int'].dtype)
- int64
- To preserve dtypes while iterating over the rows, it is better
- to use :meth:`itertuples` which returns namedtuples of the values
- and which is generally faster than ``iterrows``.
- 2. You should **never modify** something you are iterating over.
- This is not guaranteed to work in all cases. Depending on the
- data types, the iterator returns a copy and not a view, and writing
- to it will have no effect.
- """
- columns = self.columns
- klass = self._constructor_sliced
- for k, v in zip(self.index, self.values):
- s = klass(v, index=columns, name=k)
- yield k, s
- def itertuples(self, index=True, name="Pandas"):
- """
- Iterate over DataFrame rows as namedtuples.
- Parameters
- ----------
- index : bool, default True
- If True, return the index as the first element of the tuple.
- name : str or None, default "Pandas"
- The name of the returned namedtuples or None to return regular
- tuples.
- Returns
- -------
- iterator
- An object to iterate over namedtuples for each row in the
- DataFrame with the first field possibly being the index and
- following fields being the column values.
- See Also
- --------
- DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
- pairs.
- DataFrame.items : Iterate over (column name, Series) pairs.
- Notes
- -----
- The column names will be renamed to positional names if they are
- invalid Python identifiers, repeated, or start with an underscore.
- On Python versions < 3.7 regular tuples are returned for DataFrames
- with a large number of columns (>254).
- Examples
- --------
- >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
- ... index=['dog', 'hawk'])
- >>> df
- num_legs num_wings
- dog 4 0
- hawk 2 2
- >>> for row in df.itertuples():
- ... print(row)
- ...
- Pandas(Index='dog', num_legs=4, num_wings=0)
- Pandas(Index='hawk', num_legs=2, num_wings=2)
- By setting the `index` parameter to False we can remove the index
- as the first element of the tuple:
- >>> for row in df.itertuples(index=False):
- ... print(row)
- ...
- Pandas(num_legs=4, num_wings=0)
- Pandas(num_legs=2, num_wings=2)
- With the `name` parameter set we set a custom name for the yielded
- namedtuples:
- >>> for row in df.itertuples(name='Animal'):
- ... print(row)
- ...
- Animal(Index='dog', num_legs=4, num_wings=0)
- Animal(Index='hawk', num_legs=2, num_wings=2)
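- Duplicate or invalid column names are renamed to positional names in
- the yielded namedtuples (a small illustration of the note above):
- >>> df2 = pd.DataFrame([[1, 2]], columns=['my col', 'my col'])
- >>> next(df2.itertuples(index=False))
- Pandas(_0=1, _1=2)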
- """
- arrays = []
- fields = list(self.columns)
- if index:
- arrays.append(self.index)
- fields.insert(0, "Index")
- # use integer indexing because of possible duplicate column names
- arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
- # Python versions before 3.7 support at most 255 arguments to constructors
- can_return_named_tuples = PY37 or len(self.columns) + index < 255
- if name is not None and can_return_named_tuples:
- itertuple = collections.namedtuple(name, fields, rename=True)
- return map(itertuple._make, zip(*arrays))
- # fallback to regular tuples
- return zip(*arrays)
- def __len__(self) -> int:
- """
- Return the length of the info axis, which for a DataFrame is the index.
- """
- return len(self.index)
- def dot(self, other):
- """
- Compute the matrix multiplication between the DataFrame and other.
- This method computes the matrix product between the DataFrame and the
- values of another Series, DataFrame or a numpy array.
- It can also be called using ``self @ other`` in Python >= 3.5.
- Parameters
- ----------
- other : Series, DataFrame or array-like
- The other object to compute the matrix product with.
- Returns
- -------
- Series or DataFrame
- If other is a Series, return the matrix product between self and
- other as a Series. If other is a DataFrame or a numpy.array, return
- the matrix product of self and other in a DataFrame.
- See Also
- --------
- Series.dot: Similar method for Series.
- Notes
- -----
- The dimensions of DataFrame and other must be compatible in order to
- compute the matrix multiplication. In addition, the column names of
- DataFrame and the index of other must contain the same values, as they
- will be aligned prior to the multiplication.
- The dot method for Series computes the inner product, instead of the
- matrix product here.
- Examples
- --------
- Here we multiply a DataFrame with a Series.
- >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
- >>> s = pd.Series([1, 1, 2, 1])
- >>> df.dot(s)
- 0 -4
- 1 5
- dtype: int64
- Here we multiply a DataFrame with another DataFrame.
- >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
- >>> df.dot(other)
- 0 1
- 0 1 4
- 1 2 2
- Note that the dot method gives the same result as ``@``:
- >>> df @ other
- 0 1
- 0 1 4
- 1 2 2
- The dot method also works if other is an np.array.
- >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
- >>> df.dot(arr)
- 0 1
- 0 1 4
- 1 2 2
- Note how shuffling of the objects does not change the result.
- >>> s2 = s.reindex([1, 0, 2, 3])
- >>> df.dot(s2)
- 0 -4
- 1 5
- dtype: int64
- """
- if isinstance(other, (Series, DataFrame)):
- common = self.columns.union(other.index)
- if len(common) > len(self.columns) or len(common) > len(other.index):
- raise ValueError("matrices are not aligned")
- left = self.reindex(columns=common, copy=False)
- right = other.reindex(index=common, copy=False)
- lvals = left.values
- rvals = right.values
- else:
- left = self
- lvals = self.values
- rvals = np.asarray(other)
- if lvals.shape[1] != rvals.shape[0]:
- raise ValueError(
- f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
- )
- if isinstance(other, DataFrame):
- return self._constructor(
- np.dot(lvals, rvals), index=left.index, columns=other.columns
- )
- elif isinstance(other, Series):
- return Series(np.dot(lvals, rvals), index=left.index)
- elif isinstance(rvals, (np.ndarray, Index)):
- result = np.dot(lvals, rvals)
- if result.ndim == 2:
- return self._constructor(result, index=left.index)
- else:
- return Series(result, index=left.index)
- else: # pragma: no cover
- raise TypeError(f"unsupported type: {type(other)}")
- def __matmul__(self, other):
- """
- Matrix multiplication using binary `@` operator in Python>=3.5.
- """
- return self.dot(other)
- def __rmatmul__(self, other):
- """
- Matrix multiplication using binary `@` operator in Python>=3.5.
- """
- return self.T.dot(np.transpose(other)).T
- # ----------------------------------------------------------------------
- # IO methods (to / from other formats)
- @classmethod
- def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame":
- """
- Construct DataFrame from dict of array-like or dicts.
- Creates DataFrame object from dictionary by columns or by index
- allowing dtype specification.
- Parameters
- ----------
- data : dict
- Of the form {field : array-like} or {field : dict}.
- orient : {'columns', 'index'}, default 'columns'
- The "orientation" of the data. If the keys of the passed dict
- should be the columns of the resulting DataFrame, pass 'columns'
- (default). Otherwise if the keys should be rows, pass 'index'.
- dtype : dtype, default None
- Data type to force, otherwise infer.
- columns : list, default None
- Column labels to use when ``orient='index'``. Raises a ValueError
- if used with ``orient='columns'``.
- .. versionadded:: 0.23.0
- Returns
- -------
- DataFrame
- See Also
- --------
- DataFrame.from_records : DataFrame from ndarray (structured
- dtype), list of tuples, dict, or DataFrame.
- DataFrame : DataFrame object creation using constructor.
- Examples
- --------
- By default the keys of the dict become the DataFrame columns:
- >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
- >>> pd.DataFrame.from_dict(data)
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
- Specify ``orient='index'`` to create the DataFrame using dictionary
- keys as rows:
- >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
- >>> pd.DataFrame.from_dict(data, orient='index')
- 0 1 2 3
- row_1 3 2 1 0
- row_2 a b c d
- When using the 'index' orientation, the column names can be
- specified manually:
- >>> pd.DataFrame.from_dict(data, orient='index',
- ... columns=['A', 'B', 'C', 'D'])
- A B C D
- row_1 3 2 1 0
- row_2 a b c d
- """
- index = None
- orient = orient.lower()
- if orient == "index":
- if len(data) > 0:
- # TODO speed up Series case
- if isinstance(list(data.values())[0], (Series, dict)):
- data = _from_nested_dict(data)
- else:
- data, index = list(data.values()), list(data.keys())
- elif orient == "columns":
- if columns is not None:
- raise ValueError("cannot use columns parameter with orient='columns'")
- else: # pragma: no cover
- raise ValueError("only recognize index or columns for orient")
- return cls(data, index=index, columns=columns, dtype=dtype)
- def to_numpy(self, dtype=None, copy=False) -> np.ndarray:
- """
- Convert the DataFrame to a NumPy array.
- .. versionadded:: 0.24.0
- By default, the dtype of the returned array will be the common NumPy
- dtype of all types in the DataFrame. For example, if the dtypes are
- ``float16`` and ``float32``, the resulting dtype will be ``float32``.
- This may require copying data and coercing values, which may be
- expensive.
- Parameters
- ----------
- dtype : str or numpy.dtype, optional
- The dtype to pass to :meth:`numpy.asarray`.
- copy : bool, default False
- Whether to ensure that the returned value is not a view on
- another array. Note that ``copy=False`` does not *ensure* that
- ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
- a copy is made, even if not strictly necessary.
- Returns
- -------
- numpy.ndarray
- See Also
- --------
- Series.to_numpy : Similar method for Series.
- Examples
- --------
- >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
- array([[1, 3],
- [2, 4]])
- With heterogeneous data, the lowest common type will have to
- be used.
- >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
- >>> df.to_numpy()
- array([[1. , 3. ],
- [2. , 4.5]])
- For a mix of numeric and non-numeric types, the output array will
- have object dtype.
- >>> df['C'] = pd.date_range('2000', periods=2)
- >>> df.to_numpy()
- array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
- [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
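- A specific dtype can also be requested (illustrative):
- >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy(dtype="float32")
- array([[1., 3.],
- [2., 4.]], dtype=float32)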
- """
- result = np.array(self.values, dtype=dtype, copy=copy)
- return result
- def to_dict(self, orient="dict", into=dict):
- """
- Convert the DataFrame to a dictionary.
- The type of the key-value pairs can be customized with the parameters
- (see below).
- Parameters
- ----------
- orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
- Determines the type of the values of the dictionary.
- - 'dict' (default) : dict like {column -> {index -> value}}
- - 'list' : dict like {column -> [values]}
- - 'series' : dict like {column -> Series(values)}
- - 'split' : dict like
- {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
- - 'records' : list like
- [{column -> value}, ... , {column -> value}]
- - 'index' : dict like {index -> {column -> value}}
- Abbreviations are allowed. `s` indicates `series` and `sp`
- indicates `split`.
- into : class, default dict
- The collections.abc.Mapping subclass used for all Mappings
- in the return value. Can be the actual class or an empty
- instance of the mapping type you want. If you want a
- collections.defaultdict, you must pass it initialized.
- .. versionadded:: 0.21.0
- Returns
- -------
- dict, list or collections.abc.Mapping
- Return a collections.abc.Mapping object representing the DataFrame.
- The resulting transformation depends on the `orient` parameter.
- See Also
- --------
- DataFrame.from_dict: Create a DataFrame from a dictionary.
- DataFrame.to_json: Convert a DataFrame to JSON format.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2],
- ... 'col2': [0.5, 0.75]},
- ... index=['row1', 'row2'])
- >>> df
- col1 col2
- row1 1 0.50
- row2 2 0.75
- >>> df.to_dict()
- {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
- You can specify the return orientation.
- >>> df.to_dict('series')
- {'col1': row1 1
- row2 2
- Name: col1, dtype: int64,
- 'col2': row1 0.50
- row2 0.75
- Name: col2, dtype: float64}
- >>> df.to_dict('split')
- {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
- 'data': [[1, 0.5], [2, 0.75]]}
- >>> df.to_dict('records')
- [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
- >>> df.to_dict('index')
- {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
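- Abbreviated orient values work as well; ``'sp'``, for example, is
- shorthand for ``'split'``:
- >>> df.to_dict('sp')
- {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
- 'data': [[1, 0.5], [2, 0.75]]}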
- You can also specify the mapping type.
- >>> from collections import OrderedDict, defaultdict
- >>> df.to_dict(into=OrderedDict)
- OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
- ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
- If you want a `defaultdict`, you need to initialize it:
- >>> dd = defaultdict(list)
- >>> df.to_dict('records', into=dd)
- [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
- defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
- """
- if not self.columns.is_unique:
- warnings.warn(
- "DataFrame columns are not unique, some columns will be omitted.",
- UserWarning,
- stacklevel=2,
- )
- # GH16122
- into_c = com.standardize_mapping(into)
- if orient.lower().startswith("d"):
- return into_c((k, v.to_dict(into)) for k, v in self.items())
- elif orient.lower().startswith("l"):
- return into_c((k, v.tolist()) for k, v in self.items())
- elif orient.lower().startswith("sp"):
- return into_c(
- (
- ("index", self.index.tolist()),
- ("columns", self.columns.tolist()),
- (
- "data",
- [
- list(map(com.maybe_box_datetimelike, t))
- for t in self.itertuples(index=False, name=None)
- ],
- ),
- )
- )
- elif orient.lower().startswith("s"):
- return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items())
- elif orient.lower().startswith("r"):
- columns = self.columns.tolist()
- rows = (
- dict(zip(columns, row))
- for row in self.itertuples(index=False, name=None)
- )
- return [
- into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items())
- for row in rows
- ]
- elif orient.lower().startswith("i"):
- if not self.index.is_unique:
- raise ValueError("DataFrame index must be unique for orient='index'.")
- return into_c(
- (t[0], dict(zip(self.columns, t[1:])))
- for t in self.itertuples(name=None)
- )
- else:
- raise ValueError(f"orient '{orient}' not understood")
- def to_gbq(
- self,
- destination_table,
- project_id=None,
- chunksize=None,
- reauth=False,
- if_exists="fail",
- auth_local_webserver=False,
- table_schema=None,
- location=None,
- progress_bar=True,
- credentials=None,
- ) -> None:
- """
- Write a DataFrame to a Google BigQuery table.
- This function requires the `pandas-gbq package
- <https://pandas-gbq.readthedocs.io>`__.
- See the `How to authenticate with Google BigQuery
- <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
- guide for authentication instructions.
- Parameters
- ----------
- destination_table : str
- Name of table to be written, in the form ``dataset.tablename``.
- project_id : str, optional
- Google BigQuery Account project ID. Optional when available from
- the environment.
- chunksize : int, optional
- Number of rows to be inserted in each chunk from the dataframe.
- Set to ``None`` to load the whole dataframe at once.
- reauth : bool, default False
- Force Google BigQuery to re-authenticate the user. This is useful
- if multiple accounts are used.
- if_exists : str, default 'fail'
- Behavior when the destination table exists. Value can be one of:
- ``'fail'``
- If table exists raise pandas_gbq.gbq.TableCreationError.
- ``'replace'``
- If table exists, drop it, recreate it, and insert data.
- ``'append'``
- If table exists, insert data. Create if does not exist.
- auth_local_webserver : bool, default False
- Use the `local webserver flow`_ instead of the `console flow`_
- when getting user credentials.
- .. _local webserver flow:
- http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
- .. _console flow:
- http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
- *New in version 0.2.0 of pandas-gbq*.
- table_schema : list of dicts, optional
- List of BigQuery table fields to which the DataFrame
- columns conform, e.g. ``[{'name': 'col1', 'type':
- 'STRING'},...]``. If schema is not provided, it will be
- generated according to dtypes of DataFrame columns. See
- BigQuery API documentation on available names of a field.
- *New in version 0.3.1 of pandas-gbq*.
- location : str, optional
- Location where the load job should run. See the `BigQuery locations
- documentation
- <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
- list of available locations. The location must match that of the
- target dataset.
- *New in version 0.5.0 of pandas-gbq*.
- progress_bar : bool, default True
- Use the library `tqdm` to show the progress bar for the upload,
- chunk by chunk.
- *New in version 0.5.0 of pandas-gbq*.
- credentials : google.auth.credentials.Credentials, optional
- Credentials for accessing Google APIs. Use this parameter to
- override default credentials, such as to use Compute Engine
- :class:`google.auth.compute_engine.Credentials` or Service
- Account :class:`google.oauth2.service_account.Credentials`
- directly.
- *New in version 0.8.0 of pandas-gbq*.
- .. versionadded:: 0.24.0
- See Also
- --------
- pandas_gbq.to_gbq : This function in the pandas-gbq library.
- read_gbq : Read a DataFrame from Google BigQuery.
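- Examples
- --------
- A minimal sketch (hypothetical table and project names; requires
- pandas-gbq and valid credentials):
- >>> df.to_gbq('my_dataset.my_table', project_id='my-project') # doctest: +SKIP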
- """
- from pandas.io import gbq
- gbq.to_gbq(
- self,
- destination_table,
- project_id=project_id,
- chunksize=chunksize,
- reauth=reauth,
- if_exists=if_exists,
- auth_local_webserver=auth_local_webserver,
- table_schema=table_schema,
- location=location,
- progress_bar=progress_bar,
- credentials=credentials,
- )
- @classmethod
- def from_records(
- cls,
- data,
- index=None,
- exclude=None,
- columns=None,
- coerce_float=False,
- nrows=None,
- ) -> "DataFrame":
- """
- Convert structured or record ndarray to DataFrame.
- Parameters
- ----------
- data : ndarray (structured dtype), list of tuples, dict, or DataFrame
- index : str, list of fields, array-like
- Field of array to use as the index, alternately a specific set of
- input labels to use.
- exclude : sequence, default None
- Columns or fields to exclude.
- columns : sequence, default None
- Column names to use. If the passed data do not have names
- associated with them, this argument provides names for the
- columns. Otherwise this argument indicates the order of the columns
- in the result (any names not found in the data will become all-NA
- columns).
- coerce_float : bool, default False
- Attempt to convert values of non-string, non-numeric objects (like
- decimal.Decimal) to floating point, useful for SQL result sets.
- nrows : int, default None
- Number of rows to read if data is an iterator.
- Returns
- -------
- DataFrame
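- Examples
- --------
- Data can be provided as a structured ndarray (illustrative):
- >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c')],
- ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
- >>> pd.DataFrame.from_records(data)
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c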
- """
- # Make a copy of the input columns so we can modify it
- if columns is not None:
- columns = ensure_index(columns)
- if is_iterator(data):
- if nrows == 0:
- return cls()
- try:
- first_row = next(data)
- except StopIteration:
- return cls(index=index, columns=columns)
- dtype = None
- if hasattr(first_row, "dtype") and first_row.dtype.names:
- dtype = first_row.dtype
- values = [first_row]
- if nrows is None:
- values += data
- else:
- values.extend(itertools.islice(data, nrows - 1))
- if dtype is not None:
- data = np.array(values, dtype=dtype)
- else:
- data = values
- if isinstance(data, dict):
- if columns is None:
- columns = arr_columns = ensure_index(sorted(data))
- arrays = [data[k] for k in columns]
- else:
- arrays = []
- arr_columns = []
- for k, v in data.items():
- if k in columns:
- arr_columns.append(k)
- arrays.append(v)
- arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns)
- elif isinstance(data, (np.ndarray, DataFrame)):
- arrays, columns = to_arrays(data, columns)
- if columns is not None:
- columns = ensure_index(columns)
- arr_columns = columns
- else:
- arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float)
- arr_columns = ensure_index(arr_columns)
- if columns is not None:
- columns = ensure_index(columns)
- else:
- columns = arr_columns
- if exclude is None:
- exclude = set()
- else:
- exclude = set(exclude)
- result_index = None
- if index is not None:
- if isinstance(index, str) or not hasattr(index, "__iter__"):
- i = columns.get_loc(index)
- exclude.add(index)
- if len(arrays) > 0:
- result_index = Index(arrays[i], name=index)
- else:
- result_index = Index([], name=index)
- else:
- try:
- index_data = [arrays[arr_columns.get_loc(field)] for field in index]
- except (KeyError, TypeError):
- # raised by get_loc, see GH#29258
- result_index = index
- else:
- result_index = ensure_index_from_sequences(index_data, names=index)
- exclude.update(index)
- if any(exclude):
- arr_exclude = [x for x in exclude if x in arr_columns]
- to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
- arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
- arr_columns = arr_columns.drop(arr_exclude)
- columns = columns.drop(exclude)
- mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
- return cls(mgr)
- def to_records(
- self, index=True, column_dtypes=None, index_dtypes=None
- ) -> np.recarray:
- """
- Convert DataFrame to a NumPy record array.
- Index will be included as the first field of the record array if
- requested.
- Parameters
- ----------
- index : bool, default True
- Include index in resulting record array, stored in 'index'
- field or using the index label, if set.
- column_dtypes : str, type, dict, default None
- .. versionadded:: 0.24.0
- If a string or type, the data type to store all columns. If
- a dictionary, a mapping of column names and indices (zero-indexed)
- to specific data types.
- index_dtypes : str, type, dict, default None
- .. versionadded:: 0.24.0
- If a string or type, the data type to store all index levels. If
- a dictionary, a mapping of index level names and indices
- (zero-indexed) to specific data types.
- This mapping is applied only if `index=True`.
- Returns
- -------
- numpy.recarray
- NumPy ndarray with the DataFrame labels as fields and each row
- of the DataFrame as entries.
- See Also
- --------
- DataFrame.from_records: Convert structured or record ndarray
- to DataFrame.
- numpy.recarray: An ndarray that allows field access using
- attributes, analogous to typed columns in a
- spreadsheet.
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
- ... index=['a', 'b'])
- >>> df
- A B
- a 1 0.50
- b 2 0.75
- >>> df.to_records()
- rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
- dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
- If the DataFrame index has no label then the recarray field name
- is set to 'index'. If the index has a label then this is used as the
- field name:
- >>> df.index = df.index.rename("I")
- >>> df.to_records()
- rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
- dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
- The index can be excluded from the record array:
- >>> df.to_records(index=False)
- rec.array([(1, 0.5 ), (2, 0.75)],
- dtype=[('A', '<i8'), ('B', '<f8')])
- Data types can be specified for the columns:
- >>> df.to_records(column_dtypes={"A": "int32"})
- rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
- dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
- As well as for the index:
- >>> df.to_records(index_dtypes="<S2")
- rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
- dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
- >>> index_dtypes = f"<S{df.index.str.len().max()}"
- >>> df.to_records(index_dtypes=index_dtypes)
- rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
- dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
- """
- if index:
- if isinstance(self.index, ABCMultiIndex):
- # array of tuples to numpy cols; this makes copies of the data
- ix_vals = list(map(np.array, zip(*self.index.values)))
- else:
- ix_vals = [self.index.values]
- arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns]
- count = 0
- index_names = list(self.index.names)
- if isinstance(self.index, ABCMultiIndex):
- for i, n in enumerate(index_names):
- if n is None:
- index_names[i] = f"level_{count}"
- count += 1
- elif index_names[0] is None:
- index_names = ["index"]
- names = [str(name) for name in itertools.chain(index_names, self.columns)]
- else:
- arrays = [self[c]._internal_get_values() for c in self.columns]
- names = [str(c) for c in self.columns]
- index_names = []
- index_len = len(index_names)
- formats = []
- for i, v in enumerate(arrays):
- index = i
- # When the names and arrays are collected, we
- # first collect those in the DataFrame's index,
- # followed by those in its columns.
- #
- # Thus, the total length of the array is:
- # len(index_names) + len(DataFrame.columns).
- #
- # This check allows us to see whether we are
- # handling a name / array in the index or column.
- if index < index_len:
- dtype_mapping = index_dtypes
- name = index_names[index]
- else:
- index -= index_len
- dtype_mapping = column_dtypes
- name = self.columns[index]
- # We have a dictionary, so we get the data type
- # associated with the index or column (which can
- # be denoted by its name in the DataFrame or its
- # position in DataFrame's array of indices or
- # columns, whichever is applicable.
- if is_dict_like(dtype_mapping):
- if name in dtype_mapping:
- dtype_mapping = dtype_mapping[name]
- elif index in dtype_mapping:
- dtype_mapping = dtype_mapping[index]
- else:
- dtype_mapping = None
- # If no mapping can be found, use the array's
- # dtype attribute for formatting.
- #
- # A valid dtype must either be a type or
- # string naming a type.
- if dtype_mapping is None:
- formats.append(v.dtype)
- elif isinstance(dtype_mapping, (type, np.dtype, str)):
- formats.append(dtype_mapping)
- else:
- element = "row" if i < index_len else "column"
- msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
- raise ValueError(msg)
- return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
- @classmethod
- def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame":
- mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
- return cls(mgr)
- @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
- def to_stata(
- self,
- path,
- convert_dates=None,
- write_index=True,
- byteorder=None,
- time_stamp=None,
- data_label=None,
- variable_labels=None,
- version=114,
- convert_strl=None,
- ):
- """
- Export DataFrame object to Stata dta format.
- Writes the DataFrame to a Stata dataset file.
- "dta" files contain a Stata dataset.
- Parameters
- ----------
- path : str, buffer or path object
- String, path object (pathlib.Path or py._path.local.LocalPath) or
- object implementing a binary write() function. If using a buffer
- then the buffer will not be automatically closed after the file
- data has been written.
- .. versionchanged:: 1.0.0
- Previously this was "fname"
- convert_dates : dict
- Dictionary mapping columns containing datetime types to stata
- internal format to use when writing the dates. Options are 'tc',
- 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
- or a name. Datetime columns that do not have a conversion type
- specified will be converted to 'tc'. Raises NotImplementedError if
- a datetime column has timezone information.
- write_index : bool
- Write the index to Stata dataset.
- byteorder : str
- Can be ">", "<", "little", or "big". default is `sys.byteorder`.
- time_stamp : datetime
- A datetime to use as file creation date. Default is the current
- time.
- data_label : str, optional
- A label for the data set. Must be 80 characters or smaller.
- variable_labels : dict
- Dictionary containing columns as keys and variable labels as
- values. Each label must be 80 characters or smaller.
- version : {114, 117, 118, 119, None}, default 114
- Version to use in the output dta file. Set to None to let pandas
- decide between 118 or 119 formats depending on the number of
- columns in the frame. Version 114 can be read by Stata 10 and
- later. Version 117 can be read by Stata 13 or later. Version 118
- is supported in Stata 14 and later. Version 119 is supported in
- Stata 15 and later. Version 114 limits string variables to 244
- characters or fewer while versions 117 and later allow strings
- with lengths up to 2,000,000 characters. Versions 118 and 119
- support Unicode characters, and version 119 supports more than
- 32,767 variables.
- .. versionadded:: 0.23.0
- .. versionchanged:: 1.0.0
- Added support for formats 118 and 119.
- convert_strl : list, optional
- List of column names to convert to string columns to Stata StrL
- format. Only available if version is 117. Storing strings in the
- StrL format can produce smaller dta files if strings have more than
- 8 characters and values are repeated.
- .. versionadded:: 0.23.0
- Raises
- ------
- NotImplementedError
- * If datetimes contain timezone information
- * Column dtype is not representable in Stata
- ValueError
- * Columns listed in convert_dates are neither datetime64[ns]
- or datetime.datetime
- * Column listed in convert_dates is not in DataFrame
- * Categorical label contains more than 32,000 characters
- See Also
- --------
- read_stata : Import Stata data files.
- io.stata.StataWriter : Low-level writer for Stata data files.
- io.stata.StataWriter117 : Low-level writer for version 117 files.
- Examples
- --------
- >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',
- ... 'parrot'],
- ... 'speed': [350, 18, 361, 15]})
- >>> df.to_stata('animals.dta') # doctest: +SKIP
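- Datetime columns can be mapped to Stata's internal formats via
- ``convert_dates`` (a sketch; assumes a datetime column named 'date'):
- >>> df['date'] = pd.date_range('2020-01-01', periods=4) # doctest: +SKIP
- >>> df.to_stata('animals.dta', convert_dates={'date': 'td'}) # doctest: +SKIP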
- """
- if version not in (114, 117, 118, 119, None):
- raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
- if version == 114:
- if convert_strl is not None:
- raise ValueError("strl is not supported in format 114")
- from pandas.io.stata import StataWriter as statawriter
- elif version == 117:
- from pandas.io.stata import StataWriter117 as statawriter
- else: # versions 118 and 119
- from pandas.io.stata import StataWriterUTF8 as statawriter
- kwargs = {}
- if version is None or version >= 117:
- # strl conversion is only supported >= 117
- kwargs["convert_strl"] = convert_strl
- if version is None or version >= 118:
- # Specifying the version is only supported for UTF8 (118 or 119)
- kwargs["version"] = version
- writer = statawriter(
- path,
- self,
- convert_dates=convert_dates,
- byteorder=byteorder,
- time_stamp=time_stamp,
- data_label=data_label,
- write_index=write_index,
- variable_labels=variable_labels,
- **kwargs,
- )
- writer.write_file()
- @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
- def to_feather(self, path) -> None:
- """
- Write out the binary feather-format for DataFrames.
- Parameters
- ----------
- path : str
- String file path.
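- Examples
- --------
- A minimal sketch (requires pyarrow; hypothetical file name):
- >>> df = pd.DataFrame({'a': [1, 2]})
- >>> df.to_feather('df.feather') # doctest: +SKIP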
- """
- from pandas.io.feather_format import to_feather
- to_feather(self, path)
- @Appender(
- """
- Examples
- --------
- >>> df = pd.DataFrame(
- ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
- ... )
- >>> print(df.to_markdown())
- | | animal_1 | animal_2 |
- |---:|:-----------|:-----------|
- | 0 | elk | dog |
- | 1 | pig | quetzal |
- """
- )
- @Substitution(klass="DataFrame")
- @Appender(_shared_docs["to_markdown"])
- def to_markdown(
- self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs
- ) -> Optional[str]:
- kwargs.setdefault("headers", "keys")
- kwargs.setdefault("tablefmt", "pipe")
- tabulate = import_optional_dependency("tabulate")
- result = tabulate.tabulate(self, **kwargs)
- if buf is None:
- return result
- buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode)
- assert buf is not None # Help mypy.
- buf.writelines(result)
- return None
- @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
- def to_parquet(
- self,
- path,
- engine="auto",
- compression="snappy",
- index=None,
- partition_cols=None,
- **kwargs,
- ) -> None:
- """
- Write a DataFrame to the binary parquet format.
- .. versionadded:: 0.21.0
- This function writes the dataframe as a `parquet file
- <https://parquet.apache.org/>`_. You can choose different parquet
- backends, and have the option of compression. See
- :ref:`the user guide <io.parquet>` for more details.
- Parameters
- ----------
- path : str
- File path or Root Directory path. Will be used as Root Directory
- path while writing a partitioned dataset.
- .. versionchanged:: 1.0.0
- Previously this was "fname"
- engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
- Parquet library to use. If 'auto', then the option
- ``io.parquet.engine`` is used. The default ``io.parquet.engine``
- behavior is to try 'pyarrow', falling back to 'fastparquet' if
- 'pyarrow' is unavailable.
- compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
- Name of the compression to use. Use ``None`` for no compression.
- index : bool, default None
- If ``True``, include the dataframe's index(es) in the file output.
- If ``False``, they will not be written to the file.
- If ``None``, similar to ``True`` the dataframe's index(es)
- will be saved. However, instead of being saved as values,
- the RangeIndex will be stored as a range in the metadata so it
- doesn't require much space and is faster. Other indexes will
- be included as columns in the file output.
- .. versionadded:: 0.24.0
- partition_cols : list, optional, default None
- Column names by which to partition the dataset.
- Columns are partitioned in the order they are given.
- .. versionadded:: 0.24.0
- **kwargs
- Additional arguments passed to the parquet library. See
- :ref:`pandas io <io.parquet>` for more details.
- See Also
- --------
- read_parquet : Read a parquet file.
- DataFrame.to_csv : Write a csv file.
- DataFrame.to_sql : Write to a sql table.
- DataFrame.to_hdf : Write to hdf.
- Notes
- -----
- This function requires either the `fastparquet
- <https://pypi.org/project/fastparquet>`_ or `pyarrow
- <https://arrow.apache.org/docs/python/>`_ library.
- Examples
- --------
- >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
- >>> df.to_parquet('df.parquet.gzip',
- ... compression='gzip') # doctest: +SKIP
- >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
- col1 col2
- 0 1 3
- 1 2 4
- """
- from pandas.io.parquet import to_parquet
- to_parquet(
- self,
- path,
- engine,
- compression=compression,
- index=index,
- partition_cols=partition_cols,
- **kwargs,
- )
- @Substitution(
- header_type="bool",
- header="Whether to print column labels, default True",
- col_space_type="str or int",
- col_space="The minimum width of each column in CSS length "
- "units. An int is assumed to be px units.\n\n"
- " .. versionadded:: 0.25.0\n"
- " Ability to use str",
- )
- @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
- def to_html(
- self,
- buf=None,
- columns=None,
- col_space=None,
- header=True,
- index=True,
- na_rep="NaN",
- formatters=None,
- float_format=None,
- sparsify=None,
- index_names=True,
- justify=None,
- max_rows=None,
- max_cols=None,
- show_dimensions=False,
- decimal=".",
- bold_rows=True,
- classes=None,
- escape=True,
- notebook=False,
- border=None,
- table_id=None,
- render_links=False,
- encoding=None,
- ):
- """
- Render a DataFrame as an HTML table.
- %(shared_params)s
- bold_rows : bool, default True
- Make the row labels bold in the output.
- classes : str or list or tuple, default None
- CSS class(es) to apply to the resulting html table.
- escape : bool, default True
- Convert the characters <, >, and & to HTML-safe sequences.
- notebook : {True, False}, default False
- Whether the generated HTML is for IPython Notebook.
- border : int
- A ``border=border`` attribute is included in the opening
- `<table>` tag. Default ``pd.options.display.html.border``.
- encoding : str, default "utf-8"
- Set character encoding.
- .. versionadded:: 1.0
- table_id : str, optional
- A css id is included in the opening `<table>` tag if specified.
- .. versionadded:: 0.23.0
- render_links : bool, default False
- Convert URLs to HTML links.
- .. versionadded:: 0.24.0
- %(returns)s
- See Also
- --------
- to_string : Convert DataFrame to a string.
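- Examples
- --------
- A small illustration (output omitted; the full HTML is returned as a
- string when ``buf`` is None):
- >>> df = pd.DataFrame({'A': [1, 2]})
- >>> html = df.to_html(classes='my-table', table_id='frame') # doctest: +SKIP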
- """
- if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
- raise ValueError("Invalid value for justify parameter")
- formatter = fmt.DataFrameFormatter(
- self,
- columns=columns,
- col_space=col_space,
- na_rep=na_rep,
- formatters=formatters,
- float_format=float_format,
- sparsify=sparsify,
- justify=justify,
- index_names=index_names,
- header=header,
- index=index,
- bold_rows=bold_rows,
- escape=escape,
- max_rows=max_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- decimal=decimal,
- table_id=table_id,
- render_links=render_links,
- )
- # TODO: a generic formatter would be in DataFrameFormatter
- return formatter.to_html(
- buf=buf,
- classes=classes,
- notebook=notebook,
- border=border,
- encoding=encoding,
- )
- # ----------------------------------------------------------------------
- def info(
- self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
- ) -> None:
- """
- Print a concise summary of a DataFrame.
- This method prints information about a DataFrame including
- the index dtype and column dtypes, non-null values and memory usage.
- Parameters
- ----------
- verbose : bool, optional
- Whether to print the full summary. By default, the setting in
- ``pandas.options.display.max_info_columns`` is followed.
- buf : writable buffer, defaults to sys.stdout
- Where to send the output. By default, the output is printed to
- sys.stdout. Pass a writable buffer if you need to further process
- the output.
- max_cols : int, optional
- When to switch from the verbose to the truncated output. If the
- DataFrame has more than `max_cols` columns, the truncated output
- is used. By default, the setting in
- ``pandas.options.display.max_info_columns`` is used.
- memory_usage : bool, str, optional
- Specifies whether total memory usage of the DataFrame
- elements (including the index) should be displayed. By default,
- this follows the ``pandas.options.display.memory_usage`` setting.
- True always shows memory usage. False never shows memory usage.
- A value of 'deep' is equivalent to "True with deep introspection".
- Memory usage is shown in human-readable units (base-2
- representation). Without deep introspection a memory estimation is
- made based on column dtype and number of rows, assuming values
- consume the same memory amount for corresponding dtypes. With deep
- memory introspection, a real memory usage calculation is performed
- at the cost of computational resources.
- null_counts : bool, optional
- Whether to show the non-null counts. By default, this is shown
- only if the frame is smaller than
- ``pandas.options.display.max_info_rows`` and
- ``pandas.options.display.max_info_columns``. A value of True always
- shows the counts, and False never shows the counts.
- Returns
- -------
- None
- This method prints a summary of a DataFrame and returns None.
- See Also
- --------
- DataFrame.describe: Generate descriptive statistics of DataFrame
- columns.
- DataFrame.memory_usage: Memory usage of DataFrame columns.
- Examples
- --------
- >>> int_values = [1, 2, 3, 4, 5]
- >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
- >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
- >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
- ... "float_col": float_values})
- >>> df
- int_col text_col float_col
- 0 1 alpha 0.00
- 1 2 beta 0.25
- 2 3 gamma 0.50
- 3 4 delta 0.75
- 4 5 epsilon 1.00
- Prints information of all columns:
- >>> df.info(verbose=True)
- <class 'pandas.core.frame.DataFrame'>
- RangeIndex: 5 entries, 0 to 4
- Data columns (total 3 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 int_col 5 non-null int64
- 1 text_col 5 non-null object
- 2 float_col 5 non-null float64
- dtypes: float64(1), int64(1), object(1)
- memory usage: 248.0+ bytes
- Prints a summary of the column count and dtypes, but not per-column
- information:
- >>> df.info(verbose=False)
- <class 'pandas.core.frame.DataFrame'>
- RangeIndex: 5 entries, 0 to 4
- Columns: 3 entries, int_col to float_col
- dtypes: float64(1), int64(1), object(1)
- memory usage: 248.0+ bytes
- Pipe the output of DataFrame.info to a buffer instead of sys.stdout,
- get the buffer content and write it to a text file:
- >>> import io
- >>> buffer = io.StringIO()
- >>> df.info(buf=buffer)
- >>> s = buffer.getvalue()
- >>> with open("df_info.txt", "w",
- ... encoding="utf-8") as f: # doctest: +SKIP
- ... f.write(s)
- 260
- The `memory_usage` parameter allows deep introspection mode, especially
- useful for big DataFrames and for fine-tuning memory optimization:
- >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
- >>> df = pd.DataFrame({
- ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
- ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
- ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
- ... })
- >>> df.info()
- <class 'pandas.core.frame.DataFrame'>
- RangeIndex: 1000000 entries, 0 to 999999
- Data columns (total 3 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 column_1 1000000 non-null object
- 1 column_2 1000000 non-null object
- 2 column_3 1000000 non-null object
- dtypes: object(3)
- memory usage: 22.9+ MB
- >>> df.info(memory_usage='deep')
- <class 'pandas.core.frame.DataFrame'>
- RangeIndex: 1000000 entries, 0 to 999999
- Data columns (total 3 columns):
- # Column Non-Null Count Dtype
- --- ------ -------------- -----
- 0 column_1 1000000 non-null object
- 1 column_2 1000000 non-null object
- 2 column_3 1000000 non-null object
- dtypes: object(3)
- memory usage: 188.8 MB
- """
- if buf is None: # pragma: no cover
- buf = sys.stdout
- lines = []
- lines.append(str(type(self)))
- lines.append(self.index._summary())
- if len(self.columns) == 0:
- lines.append(f"Empty {type(self).__name__}")
- fmt.buffer_put_lines(buf, lines)
- return
- cols = self.columns
- col_count = len(self.columns)
- # hack
- if max_cols is None:
- max_cols = get_option("display.max_info_columns", len(self.columns) + 1)
- max_rows = get_option("display.max_info_rows", len(self) + 1)
- if null_counts is None:
- show_counts = (col_count <= max_cols) and (len(self) < max_rows)
- else:
- show_counts = null_counts
- exceeds_info_cols = col_count > max_cols
- def _verbose_repr():
- lines.append(f"Data columns (total {len(self.columns)} columns):")
- id_head = " # "
- column_head = "Column"
- col_space = 2
- max_col = max(len(pprint_thing(k)) for k in cols)
- len_column = len(pprint_thing(column_head))
- space = max(max_col, len_column) + col_space
- max_id = len(pprint_thing(col_count))
- len_id = len(pprint_thing(id_head))
- space_num = max(max_id, len_id) + col_space
- counts = None
- header = _put_str(id_head, space_num) + _put_str(column_head, space)
- if show_counts:
- counts = self.count()
- if len(cols) != len(counts): # pragma: no cover
- raise AssertionError(
- f"Columns must equal counts ({len(cols)} != {len(counts)})"
- )
- count_header = "Non-Null Count"
- len_count = len(count_header)
- non_null = " non-null"
- max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
- space_count = max(len_count, max_count) + col_space
- count_temp = "{count}" + non_null
- else:
- count_header = ""
- space_count = len(count_header)
- len_count = space_count
- count_temp = "{count}"
- dtype_header = "Dtype"
- len_dtype = len(dtype_header)
- max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
- space_dtype = max(len_dtype, max_dtypes)
- header += _put_str(count_header, space_count) + _put_str(
- dtype_header, space_dtype
- )
- lines.append(header)
- lines.append(
- _put_str("-" * len_id, space_num)
- + _put_str("-" * len_column, space)
- + _put_str("-" * len_count, space_count)
- + _put_str("-" * len_dtype, space_dtype)
- )
- for i, col in enumerate(self.columns):
- dtype = self.dtypes.iloc[i]
- col = pprint_thing(col)
- line_no = _put_str(f" {i}", space_num)
- count = ""
- if show_counts:
- count = counts.iloc[i]
- lines.append(
- line_no
- + _put_str(col, space)
- + _put_str(count_temp.format(count=count), space_count)
- + _put_str(dtype, space_dtype)
- )
- def _non_verbose_repr():
- lines.append(self.columns._summary(name="Columns"))
- def _sizeof_fmt(num, size_qualifier):
- # returns size in human readable format
- for x in ["bytes", "KB", "MB", "GB", "TB"]:
- if num < 1024.0:
- return f"{num:3.1f}{size_qualifier} {x}"
- num /= 1024.0
- return f"{num:3.1f}{size_qualifier} PB"
- if verbose:
- _verbose_repr()
- elif verbose is False: # specifically set to False, as opposed to None
- _non_verbose_repr()
- else:
- if exceeds_info_cols:
- _non_verbose_repr()
- else:
- _verbose_repr()
- counts = self._data.get_dtype_counts()
- dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())]
- lines.append(f"dtypes: {', '.join(dtypes)}")
- if memory_usage is None:
- memory_usage = get_option("display.memory_usage")
- if memory_usage:
- # append memory usage of df to display
- size_qualifier = ""
- if memory_usage == "deep":
- deep = True
- else:
- # size_qualifier is just a best effort; not guaranteed to catch
- # all cases (e.g., it misses categorical data even with object
- # categories)
- deep = False
- if "object" in counts or self.index._is_memory_usage_qualified():
- size_qualifier = "+"
- mem_usage = self.memory_usage(index=True, deep=deep).sum()
- lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n")
- fmt.buffer_put_lines(buf, lines)
- def memory_usage(self, index=True, deep=False) -> Series:
- """
- Return the memory usage of each column in bytes.
- The memory usage can optionally include the contribution of
- the index and elements of `object` dtype.
- This value is displayed in `DataFrame.info` by default. This can be
- suppressed by setting ``pandas.options.display.memory_usage`` to False.
- Parameters
- ----------
- index : bool, default True
- Specifies whether to include the memory usage of the DataFrame's
- index in returned Series. If ``index=True``, the memory usage of
- the index is the first item in the output.
- deep : bool, default False
- If True, introspect the data deeply by interrogating
- `object` dtypes for system-level memory consumption, and include
- it in the returned values.
- Returns
- -------
- Series
- A Series whose index is the original column names and whose values
- are the memory usage of each column in bytes.
- See Also
- --------
- numpy.ndarray.nbytes : Total bytes consumed by the elements of an
- ndarray.
- Series.memory_usage : Bytes consumed by a Series.
- Categorical : Memory-efficient array for string values with
- many repeated values.
- DataFrame.info : Concise summary of a DataFrame.
- Examples
- --------
- >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
- >>> data = dict([(t, np.ones(shape=5000).astype(t))
- ... for t in dtypes])
- >>> df = pd.DataFrame(data)
- >>> df.head()
- int64 float64 complex128 object bool
- 0 1 1.0 1.000000+0.000000j 1 True
- 1 1 1.0 1.000000+0.000000j 1 True
- 2 1 1.0 1.000000+0.000000j 1 True
- 3 1 1.0 1.000000+0.000000j 1 True
- 4 1 1.0 1.000000+0.000000j 1 True
- >>> df.memory_usage()
- Index 128
- int64 40000
- float64 40000
- complex128 80000
- object 40000
- bool 5000
- dtype: int64
- >>> df.memory_usage(index=False)
- int64 40000
- float64 40000
- complex128 80000
- object 40000
- bool 5000
- dtype: int64
- The memory footprint of `object` dtype columns is ignored by default:
- >>> df.memory_usage(deep=True)
- Index 128
- int64 40000
- float64 40000
- complex128 80000
- object 160000
- bool 5000
- dtype: int64
- Use a Categorical for efficient storage of an object-dtype column with
- many repeated values.
- >>> df['object'].astype('category').memory_usage(deep=True)
- 5216
- """
- result = Series(
- [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
- index=self.columns,
- )
- if index:
- result = Series(self.index.memory_usage(deep=deep), index=["Index"]).append(
- result
- )
- return result
- def transpose(self, *args, copy: bool = False) -> "DataFrame":
- """
- Transpose index and columns.
- Reflect the DataFrame over its main diagonal by writing rows as columns
- and vice-versa. The property :attr:`.T` is an accessor to the method
- :meth:`transpose`.
- Parameters
- ----------
- *args : tuple, optional
- Accepted for compatibility with NumPy.
- copy : bool, default False
- Whether to copy the data after transposing, even for DataFrames
- with a single dtype.
- Note that a copy is always required for mixed dtype DataFrames,
- or for DataFrames with any extension types.
- Returns
- -------
- DataFrame
- The transposed DataFrame.
- See Also
- --------
- numpy.transpose : Permute the dimensions of a given array.
- Notes
- -----
- Transposing a DataFrame with mixed dtypes will result in a homogeneous
- DataFrame with the `object` dtype. In such a case, a copy of the data
- is always made.
- Examples
- --------
- **Square DataFrame with homogeneous dtype**
- >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df1 = pd.DataFrame(data=d1)
- >>> df1
- col1 col2
- 0 1 3
- 1 2 4
- >>> df1_transposed = df1.T # or df1.transpose()
- >>> df1_transposed
- 0 1
- col1 1 2
- col2 3 4
- When the dtype is homogeneous in the original DataFrame, we get a
- transposed DataFrame with the same dtype:
- >>> df1.dtypes
- col1 int64
- col2 int64
- dtype: object
- >>> df1_transposed.dtypes
- 0 int64
- 1 int64
- dtype: object
- **Non-square DataFrame with mixed dtypes**
- >>> d2 = {'name': ['Alice', 'Bob'],
- ... 'score': [9.5, 8],
- ... 'employed': [False, True],
- ... 'kids': [0, 0]}
- >>> df2 = pd.DataFrame(data=d2)
- >>> df2
- name score employed kids
- 0 Alice 9.5 False 0
- 1 Bob 8.0 True 0
- >>> df2_transposed = df2.T # or df2.transpose()
- >>> df2_transposed
- 0 1
- name Alice Bob
- score 9.5 8
- employed False True
- kids 0 0
- When the DataFrame has mixed dtypes, we get a transposed DataFrame with
- the `object` dtype:
- >>> df2.dtypes
- name object
- score float64
- employed bool
- kids int64
- dtype: object
- >>> df2_transposed.dtypes
- 0 object
- 1 object
- dtype: object
- """
- nv.validate_transpose(args, dict())
- # construct the args
- dtypes = list(self.dtypes)
- if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]):
- # We have EAs with the same dtype. We can preserve that dtype in transpose.
- dtype = dtypes[0]
- arr_type = dtype.construct_array_type()
- values = self.values
- new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
- result = self._constructor(
- dict(zip(self.index, new_values)), index=self.columns
- )
- else:
- new_values = self.values.T
- if copy:
- new_values = new_values.copy()
- result = self._constructor(
- new_values, index=self.columns, columns=self.index
- )
- return result.__finalize__(self)
- T = property(transpose)
- # ----------------------------------------------------------------------
- # Indexing Methods
- def _ixs(self, i: int, axis: int = 0):
- """
- Parameters
- ----------
- i : int
- axis : int
- Notes
- -----
- If slice passed, the resulting data will be a view.
- """
- # irow
- if axis == 0:
- new_values = self._data.fast_xs(i)
- # if we are a copy, mark as such
- copy = isinstance(new_values, np.ndarray) and new_values.base is None
- result = self._constructor_sliced(
- new_values,
- index=self.columns,
- name=self.index[i],
- dtype=new_values.dtype,
- )
- result._set_is_copy(self, copy=copy)
- return result
- # icol
- else:
- label = self.columns[i]
- # if the values returned are not the same length
- # as the index (i.e. a not-found value), iget returns
- # a 0-len ndarray. This is effectively catching
- # a numpy error (as numpy should really raise)
- values = self._data.iget(i)
- if len(self.index) and not len(values):
- values = np.array([np.nan] * len(self.index), dtype=object)
- result = self._box_col_values(values, label)
- # this is a cached value, mark it so
- result._set_as_cached(label, self)
- return result
- def __getitem__(self, key):
- key = lib.item_from_zerodim(key)
- key = com.apply_if_callable(key, self)
- if is_hashable(key):
- # shortcut if the key is in columns
- if self.columns.is_unique and key in self.columns:
- if self.columns.nlevels > 1:
- return self._getitem_multilevel(key)
- return self._get_item_cache(key)
- # Do we have a slicer (on rows)?
- indexer = convert_to_index_sliceable(self, key)
- if indexer is not None:
- # either we have a slice or we have a string that can be converted
- # to a slice for partial-string date indexing
- return self._slice(indexer, axis=0)
- # Do we have a (boolean) DataFrame?
- if isinstance(key, DataFrame):
- return self.where(key)
- # Do we have a (boolean) 1d indexer?
- if com.is_bool_indexer(key):
- return self._getitem_bool_array(key)
- # We are left with two options: a single key or a collection of keys.
- # Tuples are treated as single keys here (relevant for MultiIndex columns).
- is_single_key = isinstance(key, tuple) or not is_list_like(key)
- if is_single_key:
- if self.columns.nlevels > 1:
- return self._getitem_multilevel(key)
- indexer = self.columns.get_loc(key)
- if is_integer(indexer):
- indexer = [indexer]
- else:
- if is_iterator(key):
- key = list(key)
- indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
- # take() does not accept boolean indexers
- if getattr(indexer, "dtype", None) == bool:
- indexer = np.where(indexer)[0]
- data = self._take_with_is_copy(indexer, axis=1)
- if is_single_key:
- # What does looking for a single key in a non-unique index return?
- # The behavior is inconsistent. It returns a Series, except when
- # - the key itself is repeated (test on data.shape, #9519), or
- # - we have a MultiIndex on columns (test on self.columns, #21309)
- if data.shape[1] == 1 and not isinstance(self.columns, ABCMultiIndex):
- data = data[key]
- return data
- def _getitem_bool_array(self, key):
- # also raises Exception if object array with NA values
- # warning here just in case -- previously __setitem__ was
- # reindexing but __getitem__ was not; it seems more reasonable to
- # go with the __setitem__ behavior since that is more consistent
- # with all other indexing behavior
- if isinstance(key, Series) and not key.index.equals(self.index):
- warnings.warn(
- "Boolean Series key will be reindexed to match DataFrame index.",
- UserWarning,
- stacklevel=3,
- )
- elif len(key) != len(self.index):
- raise ValueError(
- f"Item wrong length {len(key)} instead of {len(self.index)}."
- )
- # check_bool_indexer will throw exception if Series key cannot
- # be reindexed to match DataFrame rows
- key = check_bool_indexer(self.index, key)
- indexer = key.nonzero()[0]
- return self._take_with_is_copy(indexer, axis=0)
- def _getitem_multilevel(self, key):
- # self.columns is a MultiIndex
- loc = self.columns.get_loc(key)
- if isinstance(loc, (slice, Series, np.ndarray, Index)):
- new_columns = self.columns[loc]
- result_columns = maybe_droplevels(new_columns, key)
- if self._is_mixed_type:
- result = self.reindex(columns=new_columns)
- result.columns = result_columns
- else:
- new_values = self.values[:, loc]
- result = self._constructor(
- new_values, index=self.index, columns=result_columns
- )
- result = result.__finalize__(self)
- # If there is only one column being returned, and its name is
- # either an empty string, or a tuple with an empty string as its
- # first element, then treat the empty string as a placeholder
- # and return the column as if the user had provided that empty
- # string in the key. If the result is a Series, exclude the
- # implied empty string from its name.
- if len(result.columns) == 1:
- top = result.columns[0]
- if isinstance(top, tuple):
- top = top[0]
- if top == "":
- result = result[""]
- if isinstance(result, Series):
- result = self._constructor_sliced(
- result, index=self.index, name=key
- )
- result._set_is_copy(self)
- return result
- else:
- return self._get_item_cache(key)
- def _get_value(self, index, col, takeable: bool = False):
- """
- Quickly retrieve a single value at the passed column and index.
- Parameters
- ----------
- index : row label
- col : column label
- takeable : bool, default False
- Interpret the index/col as positional indexers.
- Returns
- -------
- scalar
- """
- if takeable:
- series = self._iget_item_cache(col)
- return com.maybe_box_datetimelike(series._values[index])
- series = self._get_item_cache(col)
- engine = self.index._engine
- try:
- return engine.get_value(series._values, index)
- except KeyError:
- # GH 20629
- if self.index.nlevels > 1:
- # partial indexing forbidden
- raise
- except (TypeError, ValueError):
- pass
- # we cannot handle direct indexing
- # use positional
- col = self.columns.get_loc(col)
- index = self.index.get_loc(index)
- return self._get_value(index, col, takeable=True)
- def __setitem__(self, key, value):
- key = com.apply_if_callable(key, self)
- # see if we can slice the rows
- indexer = convert_to_index_sliceable(self, key)
- if indexer is not None:
- # either we have a slice or we have a string that can be converted
- # to a slice for partial-string date indexing
- return self._setitem_slice(indexer, value)
- if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
- self._setitem_frame(key, value)
- elif isinstance(key, (Series, np.ndarray, list, Index)):
- self._setitem_array(key, value)
- else:
- # set column
- self._set_item(key, value)
- def _setitem_slice(self, key, value):
- # NB: we can't just use self.loc[key] = value because that
- # operates on labels and we need to operate positional for
- # backwards-compat, xref GH#31469
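- # e.g. with an integer index of [5, 6, 7], ``df[0:2] = value`` writes to
- # the first two positions rather than to labels 0 and 1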
- self._check_setitem_copy()
- self.loc._setitem_with_indexer(key, value)
- def _setitem_array(self, key, value):
- # also raises Exception if object array with NA values
- if com.is_bool_indexer(key):
- if len(key) != len(self.index):
- raise ValueError(
- f"Item wrong length {len(key)} instead of {len(self.index)}!"
- )
- key = check_bool_indexer(self.index, key)
- indexer = key.nonzero()[0]
- self._check_setitem_copy()
- self.loc._setitem_with_indexer(indexer, value)
- else:
- if isinstance(value, DataFrame):
- if len(value.columns) != len(key):
- raise ValueError("Columns must be same length as key")
- for k1, k2 in zip(key, value.columns):
- self[k1] = value[k2]
- else:
- indexer = self.loc._get_listlike_indexer(
- key, axis=1, raise_missing=False
- )[1]
- self._check_setitem_copy()
- self.loc._setitem_with_indexer((slice(None), indexer), value)
- def _setitem_frame(self, key, value):
- # support boolean setting with DataFrame input, e.g.
- # df[df > df2] = 0
- if isinstance(key, np.ndarray):
- if key.shape != self.shape:
- raise ValueError("Array conditional must be same shape as self")
- key = self._constructor(key, **self._construct_axes_dict())
- if key.values.size and not is_bool_dtype(key.values):
- raise TypeError(
- "Must pass DataFrame or 2-d ndarray with boolean values only"
- )
- self._check_inplace_setting(value)
- self._check_setitem_copy()
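- # ``_where`` keeps existing values where the condition is True, so the
- # mask is inverted: positions where ``key`` is True receive ``value``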
- self._where(~key, value, inplace=True)
- def _set_item(self, key, value):
- """
- Add series to DataFrame in specified column.
- If the value is a numpy array (not a Series), it must be the same
- length as the DataFrame's index or an error will be thrown. A Series
- will be conformed to the DataFrame's index to ensure homogeneity.
- """
- self._ensure_valid_index(value)
- value = self._sanitize_column(key, value)
- NDFrame._set_item(self, key, value)
- # check if we are modifying a copy
- # try to set first as we want an invalid
- # value exception to occur first
- if len(self):
- self._check_setitem_copy()
- def _set_value(self, index, col, value, takeable: bool = False):
- """
- Put single value at passed column and index.
- Parameters
- ----------
- index : row label
- col : column label
- value : scalar
- takeable : interpret the index/col as indexers, default False
- Returns
- -------
- DataFrame
- If label pair is contained, will be reference to calling DataFrame,
- otherwise a new object.
- """
- try:
- if takeable is True:
- series = self._iget_item_cache(col)
- return series._set_value(index, value, takeable=True)
- series = self._get_item_cache(col)
- engine = self.index._engine
- engine.set_value(series._values, index, value)
- return self
- except (KeyError, TypeError):
- # set using a non-recursive method & reset the cache
- if takeable:
- self.iloc[index, col] = value
- else:
- self.loc[index, col] = value
- self._item_cache.pop(col, None)
- return self
- def _ensure_valid_index(self, value):
- """
- Ensure that if we don't have an index, we can create one from the
- passed value.
- """
- # GH5632, make sure that we are a Series convertible
- if not len(self.index) and is_list_like(value) and len(value):
- try:
- value = Series(value)
- except (ValueError, NotImplementedError, TypeError):
- raise ValueError(
- "Cannot set a frame with no defined index "
- "and a value that cannot be converted to a "
- "Series"
- )
- self._data = self._data.reindex_axis(
- value.index.copy(), axis=1, fill_value=np.nan
- )
- def _box_item_values(self, key, values):
- items = self.columns[self.columns.get_loc(key)]
- if values.ndim == 2:
- return self._constructor(values.T, columns=items, index=self.index)
- else:
- return self._box_col_values(values, items)
- def _box_col_values(self, values, items):
- """
- Provide boxed values for a column.
- """
- klass = self._constructor_sliced
- return klass(values, index=self.index, name=items, fastpath=True)
- # ----------------------------------------------------------------------
- # Unsorted
- def query(self, expr, inplace=False, **kwargs):
- """
- Query the columns of a DataFrame with a boolean expression.
- Parameters
- ----------
- expr : str
- The query string to evaluate.
- You can refer to variables
- in the environment by prefixing them with an '@' character like
- ``@a + b``.
- You can refer to column names that contain spaces or operators by
- surrounding them in backticks. This way you can also escape
- names that start with a digit, or those that are a Python keyword.
- In short, use backticks whenever the name is not a valid Python
- identifier. See the Notes section below for more details.
- For example, if one of your columns is called ``a a`` and you want
- to sum it with ``b``, your query should be ```a a` + b``.
- .. versionadded:: 0.25.0
- Backtick quoting introduced.
- .. versionadded:: 1.0.0
- Backtick quoting extended to characters other than spaces.
- inplace : bool, default False
- Whether the query should modify the data in place or return
- a modified copy.
- **kwargs
- See the documentation for :func:`eval` for complete details
- on the keyword arguments accepted by :meth:`DataFrame.query`.
- Returns
- -------
- DataFrame
- DataFrame resulting from the provided query expression.
- See Also
- --------
- eval : Evaluate a string describing operations on
- DataFrame columns.
- DataFrame.eval : Evaluate a string describing operations on
- DataFrame columns.
- Notes
- -----
- The result of the evaluation of this expression is first passed to
- :attr:`DataFrame.loc` and if that fails because of a
- multidimensional key (e.g., a DataFrame) then the result will be passed
- to :meth:`DataFrame.__getitem__`.
- This method uses the top-level :func:`eval` function to
- evaluate the passed query.
- The :meth:`~pandas.DataFrame.query` method uses a slightly
- modified Python syntax by default. For example, the ``&`` and ``|``
- (bitwise) operators have the precedence of their boolean cousins,
- :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
- however the semantics are different.
- You can change the semantics of the expression by passing the keyword
- argument ``parser='python'``. This enforces the same semantics as
- evaluation in Python space. Likewise, you can pass ``engine='python'``
- to evaluate an expression using Python itself as a backend. This is not
- recommended as it is inefficient compared to using ``numexpr`` as the
- engine.
- The :attr:`DataFrame.index` and
- :attr:`DataFrame.columns` attributes of the
- :class:`~pandas.DataFrame` instance are placed in the query namespace
- by default, which allows you to treat both the index and columns of the
- frame as a column in the frame.
- The identifier ``index`` is used for the frame index; you can also
- use the name of the index to identify it in a query. Please note that
- Python keywords may not be used as identifiers.
- For further details and examples see the ``query`` documentation in
- :ref:`indexing <indexing.query>`.
- *Backtick quoted variables*
- Backtick quoted variables are parsed as literal Python code and
- are converted internally to a valid Python identifier.
- This can lead to the following problems.
- During parsing a number of disallowed characters inside the backtick
- quoted string are replaced by strings that are allowed as a Python identifier.
- These characters include all operators in Python, the space character, the
- question mark, the exclamation mark, the dollar sign, and the euro sign.
- For other characters that fall outside the ASCII range (U+0001..U+007F)
- and those that are not further specified in PEP 3131,
- the query parser will raise an error.
- Whitespace other than the space character is not allowed, nor are
- the hash character (as it is used for comments) and the backtick
- itself (the backtick cannot be escaped).
- In a special case, quotes that make a pair around a backtick can
- confuse the parser.
- For example, ```it's` > `that's``` will raise an error,
- as it forms a quoted string (``'s > `that'``) with a backtick inside.
- See also the Python documentation about lexical analysis
- (https://docs.python.org/3/reference/lexical_analysis.html)
- in combination with the source code in :mod:`pandas.core.computation.parsing`.
- Examples
- --------
- >>> df = pd.DataFrame({'A': range(1, 6),
- ... 'B': range(10, 0, -2),
- ... 'C C': range(10, 5, -1)})
- >>> df
- A B C C
- 0 1 10 10
- 1 2 8 9
- 2 3 6 8
- 3 4 4 7
- 4 5 2 6
- >>> df.query('A > B')
- A B C C
- 4 5 2 6
- The previous expression is equivalent to
- >>> df[df.A > df.B]
- A B C C
- 4 5 2 6
- For columns with spaces in their name, you can use backtick quoting.
- >>> df.query('B == `C C`')
- A B C C
- 0 1 10 10
- The previous expression is equivalent to
- >>> df[df.B == df['C C']]
- A B C C
- 0 1 10 10
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- if not isinstance(expr, str):
- msg = f"expr must be a string to be evaluated, {type(expr)} given"
- raise ValueError(msg)
- kwargs["level"] = kwargs.pop("level", 0) + 1
- kwargs["target"] = None
- res = self.eval(expr, **kwargs)
- try:
- new_data = self.loc[res]
- except ValueError:
- # when res is multi-dimensional loc raises, but this is sometimes a
- # valid query
- new_data = self[res]
- if inplace:
- self._update_inplace(new_data)
- else:
- return new_data
- def eval(self, expr, inplace=False, **kwargs):
- """
- Evaluate a string describing operations on DataFrame columns.
- Operates on columns only, not specific rows or elements. This allows
- `eval` to run arbitrary code, which can make you vulnerable to code
- injection if you pass user input to this function.
- Parameters
- ----------
- expr : str
- The expression string to evaluate.
- inplace : bool, default False
- If the expression contains an assignment, whether to perform the
- operation inplace and mutate the existing DataFrame. Otherwise,
- a new DataFrame is returned.
- **kwargs
- See the documentation for :func:`eval` for complete details
- on the keyword arguments accepted by
- :meth:`~pandas.DataFrame.eval`.
- Returns
- -------
- ndarray, scalar, or pandas object
- The result of the evaluation.
- See Also
- --------
- DataFrame.query : Evaluates a boolean expression to query the columns
- of a frame.
- DataFrame.assign : Can evaluate an expression or function to create new
- values for a column.
- eval : Evaluate a Python expression as a string using various
- backends.
- Notes
- -----
- For more details see the API documentation for :func:`~eval`.
- For detailed examples see :ref:`enhancing performance with eval
- <enhancingperf.eval>`.
- Examples
- --------
- >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
- >>> df
- A B
- 0 1 10
- 1 2 8
- 2 3 6
- 3 4 4
- 4 5 2
- >>> df.eval('A + B')
- 0 11
- 1 10
- 2 9
- 3 8
- 4 7
- dtype: int64
- Assignment is allowed, though by default the original DataFrame is not
- modified.
- >>> df.eval('C = A + B')
- A B C
- 0 1 10 11
- 1 2 8 10
- 2 3 6 9
- 3 4 4 8
- 4 5 2 7
- >>> df
- A B
- 0 1 10
- 1 2 8
- 2 3 6
- 3 4 4
- 4 5 2
- Use ``inplace=True`` to modify the original DataFrame.
- >>> df.eval('C = A + B', inplace=True)
- >>> df
- A B C
- 0 1 10 11
- 1 2 8 10
- 2 3 6 9
- 3 4 4 8
- 4 5 2 7
- """
- from pandas.core.computation.eval import eval as _eval
- inplace = validate_bool_kwarg(inplace, "inplace")
- resolvers = kwargs.pop("resolvers", None)
- kwargs["level"] = kwargs.pop("level", 0) + 1
- if resolvers is None:
- index_resolvers = self._get_index_resolvers()
- column_resolvers = self._get_cleaned_column_resolvers()
- resolvers = column_resolvers, index_resolvers
- if "target" not in kwargs:
- kwargs["target"] = self
- kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
- return _eval(expr, inplace=inplace, **kwargs)
- def select_dtypes(self, include=None, exclude=None) -> "DataFrame":
- """
- Return a subset of the DataFrame's columns based on the column dtypes.
- Parameters
- ----------
- include, exclude : scalar or list-like
- A selection of dtypes or strings to be included/excluded. At least
- one of these parameters must be supplied.
- Returns
- -------
- DataFrame
- The subset of the frame including the dtypes in ``include`` and
- excluding the dtypes in ``exclude``.
- Raises
- ------
- ValueError
- * If both ``include`` and ``exclude`` are empty
- * If ``include`` and ``exclude`` have overlapping elements
- * If any kind of string dtype is passed in.
- Notes
- -----
- * To select all *numeric* types, use ``np.number`` or ``'number'``
- * To select strings you must use the ``object`` dtype, but note that
- this will return *all* object dtype columns
- * See the `numpy dtype hierarchy
- <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
- * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
- ``'datetime64'``
- * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
- ``'timedelta64'``
- * To select Pandas categorical dtypes, use ``'category'``
- * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
- 0.20.0) or ``'datetime64[ns, tz]'``
- Examples
- --------
- >>> df = pd.DataFrame({'a': [1, 2] * 3,
- ... 'b': [True, False] * 3,
- ... 'c': [1.0, 2.0] * 3})
- >>> df
- a b c
- 0 1 True 1.0
- 1 2 False 2.0
- 2 1 True 1.0
- 3 2 False 2.0
- 4 1 True 1.0
- 5 2 False 2.0
- >>> df.select_dtypes(include='bool')
- b
- 0 True
- 1 False
- 2 True
- 3 False
- 4 True
- 5 False
- >>> df.select_dtypes(include=['float64'])
- c
- 0 1.0
- 1 2.0
- 2 1.0
- 3 2.0
- 4 1.0
- 5 2.0
- >>> df.select_dtypes(exclude=['int'])
- b c
- 0 True 1.0
- 1 False 2.0
- 2 True 1.0
- 3 False 2.0
- 4 True 1.0
- 5 False 2.0
- """
- if not is_list_like(include):
- include = (include,) if include is not None else ()
- if not is_list_like(exclude):
- exclude = (exclude,) if exclude is not None else ()
- selection = (frozenset(include), frozenset(exclude))
- if not any(selection):
- raise ValueError("at least one of include or exclude must be nonempty")
- # convert the myriad valid dtype specifiers to a single representation
- include = frozenset(infer_dtype_from_object(x) for x in include)
- exclude = frozenset(infer_dtype_from_object(x) for x in exclude)
- for dtypes in (include, exclude):
- invalidate_string_dtypes(dtypes)
- # can't both include AND exclude!
- if not include.isdisjoint(exclude):
- raise ValueError(f"include and exclude overlap on {(include & exclude)}")
- # We raise when both include and exclude are empty
- # Hence, we can just shrink the columns we want to keep
- keep_these = np.full(self.shape[1], True)
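- # helper: of the dtypes actually present in the frame, keep those that
- # are subclasses of any dtype in the given include/exclude set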
- def extract_unique_dtypes_from_dtypes_set(
- dtypes_set: FrozenSet[Dtype], unique_dtypes: np.ndarray
- ) -> List[Dtype]:
- extracted_dtypes = [
- unique_dtype
- for unique_dtype in unique_dtypes
- if issubclass(unique_dtype.type, tuple(dtypes_set)) # type: ignore
- ]
- return extracted_dtypes
- unique_dtypes = self.dtypes.unique()
- if include:
- included_dtypes = extract_unique_dtypes_from_dtypes_set(
- include, unique_dtypes
- )
- keep_these &= self.dtypes.isin(included_dtypes)
- if exclude:
- excluded_dtypes = extract_unique_dtypes_from_dtypes_set(
- exclude, unique_dtypes
- )
- keep_these &= ~self.dtypes.isin(excluded_dtypes)
- return self.iloc[:, keep_these.values]
- def insert(self, loc, column, value, allow_duplicates=False) -> None:
- """
- Insert column into DataFrame at specified location.
- Raises a ValueError if `column` is already contained in the DataFrame,
- unless `allow_duplicates` is set to True.
- Parameters
- ----------
- loc : int
- Insertion index. Must satisfy 0 <= loc <= len(columns).
- column : str, number, or hashable object
- Label of the inserted column.
- value : int, Series, or array-like
- allow_duplicates : bool, optional
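- Examples
- --------
- A minimal illustration; ``insert`` modifies the frame in place:
- >>> df = pd.DataFrame({'A': [1, 2]})
- >>> df.insert(1, 'B', [3, 4])
- >>> df
- A  B
- 0  1  3
- 1  2  4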
- """
- self._ensure_valid_index(value)
- value = self._sanitize_column(column, value, broadcast=False)
- self._data.insert(loc, column, value, allow_duplicates=allow_duplicates)
- def assign(self, **kwargs) -> "DataFrame":
- r"""
- Assign new columns to a DataFrame.
- Returns a new object with all original columns in addition to new ones.
- Existing columns that are re-assigned will be overwritten.
- Parameters
- ----------
- **kwargs : dict of {str: callable or Series}
- The column names are keywords. If the values are
- callable, they are computed on the DataFrame and
- assigned to the new columns. The callable must not
- change the input DataFrame (though pandas doesn't check this).
- If the values are not callable, (e.g. a Series, scalar, or array),
- they are simply assigned.
- Returns
- -------
- DataFrame
- A new DataFrame with the new columns in addition to
- all the existing columns.
- Notes
- -----
- Assigning multiple columns within the same ``assign`` is possible.
- Later items in '\*\*kwargs' may refer to newly created or modified
- columns in 'df'; items are computed and assigned into 'df' in order.
- .. versionchanged:: 0.23.0
- Keyword argument order is maintained.
- Examples
- --------
- >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
- ... index=['Portland', 'Berkeley'])
- >>> df
- temp_c
- Portland 17.0
- Berkeley 25.0
- Where the value is a callable, evaluated on `df`:
- >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
- temp_c temp_f
- Portland 17.0 62.6
- Berkeley 25.0 77.0
- Alternatively, the same behavior can be achieved by directly
- referencing an existing Series or sequence:
- >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
- temp_c temp_f
- Portland 17.0 62.6
- Berkeley 25.0 77.0
- You can create multiple columns within the same assign where one
- of the columns depends on another one defined within the same assign:
- >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
- ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
- temp_c temp_f temp_k
- Portland 17.0 62.6 290.15
- Berkeley 25.0 77.0 298.15
- """
- data = self.copy()
- for k, v in kwargs.items():
- data[k] = com.apply_if_callable(v, data)
- return data
- def _sanitize_column(self, key, value, broadcast=True):
- """
- Ensures new columns (which go into the BlockManager as new blocks) are
- always copied and converted into an array.
- Parameters
- ----------
- key : object
- value : scalar, Series, or array-like
- broadcast : bool, default True
- If ``key`` matches multiple duplicate column names in the
- DataFrame, this parameter indicates whether ``value`` should be
- tiled so that the returned array contains a (duplicated) column for
- each occurrence of the key. If False, ``value`` will not be tiled.
- Returns
- -------
- numpy.ndarray
- """
- def reindexer(value):
- # reindex if necessary
- if value.index.equals(self.index) or not len(self.index):
- value = value._values.copy()
- else:
- # GH 4107
- try:
- value = value.reindex(self.index)._values
- except ValueError as err:
- # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
- if not value.index.is_unique:
- # duplicate axis
- raise err
- # other
- raise TypeError(
- "incompatible index of inserted column with frame index"
- )
- return value
- if isinstance(value, Series):
- value = reindexer(value)
- elif isinstance(value, DataFrame):
- # align right-hand-side columns if self.columns
- # is multi-index and self[key] is a sub-frame
- if isinstance(self.columns, ABCMultiIndex) and key in self.columns:
- loc = self.columns.get_loc(key)
- if isinstance(loc, (slice, Series, np.ndarray, Index)):
- cols = maybe_droplevels(self.columns[loc], key)
- if len(cols) and not cols.equals(value.columns):
- value = value.reindex(cols, axis=1)
- # now align rows
- value = reindexer(value).T
- elif isinstance(value, ExtensionArray):
- # Explicitly copy here, instead of in sanitize_index,
- # as sanitize_index won't copy an EA, even with copy=True
- value = value.copy()
- value = sanitize_index(value, self.index, copy=False)
- elif isinstance(value, Index) or is_sequence(value):
- # turn me into an ndarray
- value = sanitize_index(value, self.index, copy=False)
- if not isinstance(value, (np.ndarray, Index)):
- if isinstance(value, list) and len(value) > 0:
- value = maybe_convert_platform(value)
- else:
- value = com.asarray_tuplesafe(value)
- elif value.ndim == 2:
- value = value.copy().T
- elif isinstance(value, Index):
- value = value.copy(deep=True)
- else:
- value = value.copy()
- # possibly infer to datetimelike
- if is_object_dtype(value.dtype):
- value = maybe_infer_to_datetimelike(value)
- else:
- # cast ignores pandas dtypes, so save the dtype first
- infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True)
- # upcast
- value = cast_scalar_to_array(len(self.index), value)
- value = maybe_cast_to_datetime(value, infer_dtype)
- # return internal types directly
- if is_extension_array_dtype(value):
- return value
- # broadcast across multiple columns if necessary
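- # e.g. if the columns are ['a', 'a'] and key == 'a', a 1-D value is
- # tiled to shape (2, len(self.index)) so each duplicate column gets it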
- if broadcast and key in self.columns and value.ndim == 1:
- if not self.columns.is_unique or isinstance(self.columns, ABCMultiIndex):
- existing_piece = self[key]
- if isinstance(existing_piece, DataFrame):
- value = np.tile(value, (len(existing_piece.columns), 1))
- return np.atleast_2d(np.asarray(value))
- @property
- def _series(self):
- return {
- item: Series(self._data.iget(idx), index=self.index, name=item)
- for idx, item in enumerate(self.columns)
- }
- def lookup(self, row_labels, col_labels) -> np.ndarray:
- """
- Label-based "fancy indexing" function for DataFrame.
- Given equal-length arrays of row and column labels, return an
- array of the values corresponding to each (row, col) pair.
- Parameters
- ----------
- row_labels : sequence
- The row labels to use for lookup.
- col_labels : sequence
- The column labels to use for lookup.
- Returns
- -------
- numpy.ndarray
- The found values.
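- Examples
- --------
- A small illustration of the (row, col) pairing:
- >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['a', 'b'])
- >>> df.lookup(['a', 'b'], ['B', 'A'])
- array([3, 2])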
- """
- n = len(row_labels)
- if n != len(col_labels):
- raise ValueError("Row labels must have same size as column labels")
- thresh = 1000
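- # for a mixed-dtype frame, ``self.values`` densifies everything to a
- # single (often object) dtype, so small lookups take the cheaper
- # per-element path in the else-branch below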
- if not self._is_mixed_type or n > thresh:
- values = self.values
- ridx = self.index.get_indexer(row_labels)
- cidx = self.columns.get_indexer(col_labels)
- if (ridx == -1).any():
- raise KeyError("One or more row labels was not found")
- if (cidx == -1).any():
- raise KeyError("One or more column labels was not found")
- flat_index = ridx * len(self.columns) + cidx
- result = values.flat[flat_index]
- else:
- result = np.empty(n, dtype="O")
- for i, (r, c) in enumerate(zip(row_labels, col_labels)):
- result[i] = self._get_value(r, c)
- if is_object_dtype(result):
- result = lib.maybe_convert_objects(result)
- return result
- # ----------------------------------------------------------------------
- # Reindexing and alignment
- def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
- frame = self
- columns = axes["columns"]
- if columns is not None:
- frame = frame._reindex_columns(
- columns, method, copy, level, fill_value, limit, tolerance
- )
- index = axes["index"]
- if index is not None:
- frame = frame._reindex_index(
- index, method, copy, level, fill_value, limit, tolerance
- )
- return frame
- def _reindex_index(
- self,
- new_index,
- method,
- copy,
- level,
- fill_value=np.nan,
- limit=None,
- tolerance=None,
- ):
- new_index, indexer = self.index.reindex(
- new_index, method=method, level=level, limit=limit, tolerance=tolerance
- )
- return self._reindex_with_indexers(
- {0: [new_index, indexer]},
- copy=copy,
- fill_value=fill_value,
- allow_dups=False,
- )
- def _reindex_columns(
- self,
- new_columns,
- method,
- copy,
- level,
- fill_value=None,
- limit=None,
- tolerance=None,
- ):
- new_columns, indexer = self.columns.reindex(
- new_columns, method=method, level=level, limit=limit, tolerance=tolerance
- )
- return self._reindex_with_indexers(
- {1: [new_columns, indexer]},
- copy=copy,
- fill_value=fill_value,
- allow_dups=False,
- )
- def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame":
- """
- We are guaranteed non-Nones in the axes.
- """
- new_index, row_indexer = self.index.reindex(axes["index"])
- new_columns, col_indexer = self.columns.reindex(axes["columns"])
- if row_indexer is not None and col_indexer is not None:
- indexer = row_indexer, col_indexer
- new_values = algorithms.take_2d_multi(
- self.values, indexer, fill_value=fill_value
- )
- return self._constructor(new_values, index=new_index, columns=new_columns)
- else:
- return self._reindex_with_indexers(
- {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
- copy=copy,
- fill_value=fill_value,
- )
- @Appender(_shared_docs["align"] % _shared_doc_kwargs)
- def align(
- self,
- other,
- join="outer",
- axis=None,
- level=None,
- copy=True,
- fill_value=None,
- method=None,
- limit=None,
- fill_axis=0,
- broadcast_axis=None,
- ) -> "DataFrame":
- return super().align(
- other,
- join=join,
- axis=axis,
- level=level,
- copy=copy,
- fill_value=fill_value,
- method=method,
- limit=limit,
- fill_axis=fill_axis,
- broadcast_axis=broadcast_axis,
- )
- @Substitution(**_shared_doc_kwargs)
- @Appender(NDFrame.reindex.__doc__)
- @rewrite_axis_style_signature(
- "labels",
- [
- ("method", None),
- ("copy", True),
- ("level", None),
- ("fill_value", np.nan),
- ("limit", None),
- ("tolerance", None),
- ],
- )
- def reindex(self, *args, **kwargs) -> "DataFrame":
- axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex")
- kwargs.update(axes)
- # Pop these, since the values are in `kwargs` under different names
- kwargs.pop("axis", None)
- kwargs.pop("labels", None)
- return super().reindex(**kwargs)
- def drop(
- self,
- labels=None,
- axis=0,
- index=None,
- columns=None,
- level=None,
- inplace=False,
- errors="raise",
- ):
- """
- Drop specified labels from rows or columns.
- Remove rows or columns by specifying label names and corresponding
- axis, or by specifying directly index or column names. When using a
- multi-index, labels on different levels can be removed by specifying
- the level.
- Parameters
- ----------
- labels : single label or list-like
- Index or column labels to drop.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Whether to drop labels from the index (0 or 'index') or
- columns (1 or 'columns').
- index : single label or list-like
- Alternative to specifying axis (``labels, axis=0``
- is equivalent to ``index=labels``).
- .. versionadded:: 0.21.0
- columns : single label or list-like
- Alternative to specifying axis (``labels, axis=1``
- is equivalent to ``columns=labels``).
- .. versionadded:: 0.21.0
- level : int or level name, optional
- For MultiIndex, level from which the labels will be removed.
- inplace : bool, default False
- If True, do operation inplace and return None.
- errors : {'ignore', 'raise'}, default 'raise'
- If 'ignore', suppress error and only existing labels are
- dropped.
- Returns
- -------
- DataFrame
- DataFrame without the removed index or column labels.
- Raises
- ------
- KeyError
- If any of the labels is not found in the selected axis.
- See Also
- --------
- DataFrame.loc : Label-location based indexer for selection by label.
- DataFrame.dropna : Return DataFrame with labels on given axis omitted
- where (all or any) data are missing.
- DataFrame.drop_duplicates : Return DataFrame with duplicate rows
- removed, optionally only considering certain columns.
- Series.drop : Return Series with specified index labels removed.
- Examples
- --------
- >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
- ... columns=['A', 'B', 'C', 'D'])
- >>> df
- A B C D
- 0 0 1 2 3
- 1 4 5 6 7
- 2 8 9 10 11
- Drop columns
- >>> df.drop(['B', 'C'], axis=1)
- A D
- 0 0 3
- 1 4 7
- 2 8 11
- >>> df.drop(columns=['B', 'C'])
- A D
- 0 0 3
- 1 4 7
- 2 8 11
- Drop a row by index
- >>> df.drop([0, 1])
- A B C D
- 2 8 9 10 11
- Drop columns and/or rows of MultiIndex DataFrame
- >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
- ... ['speed', 'weight', 'length']],
- ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
- ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
- >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
- ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
- ... [250, 150], [1.5, 0.8], [320, 250],
- ... [1, 0.8], [0.3, 0.2]])
- >>> df
- big small
- lama speed 45.0 30.0
- weight 200.0 100.0
- length 1.5 1.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- length 1.5 0.8
- falcon speed 320.0 250.0
- weight 1.0 0.8
- length 0.3 0.2
- >>> df.drop(index='cow', columns='small')
- big
- lama speed 45.0
- weight 200.0
- length 1.5
- falcon speed 320.0
- weight 1.0
- length 0.3
- >>> df.drop(index='length', level=1)
- big small
- lama speed 45.0 30.0
- weight 200.0 100.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- falcon speed 320.0 250.0
- weight 1.0 0.8
- """
- return super().drop(
- labels=labels,
- axis=axis,
- index=index,
- columns=columns,
- level=level,
- inplace=inplace,
- errors=errors,
- )
- @rewrite_axis_style_signature(
- "mapper",
- [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")],
- )
- def rename(
- self,
- mapper: Optional[Renamer] = None,
- *,
- index: Optional[Renamer] = None,
- columns: Optional[Renamer] = None,
- axis: Optional[Axis] = None,
- copy: bool = True,
- inplace: bool = False,
- level: Optional[Level] = None,
- errors: str = "ignore",
- ) -> Optional["DataFrame"]:
- """
- Alter axes labels.
- Function / dict values must be unique (1-to-1). Labels not contained in
- a dict / Series will be left as-is. Extra labels listed don't throw an
- error.
- See the :ref:`user guide <basics.rename>` for more.
- Parameters
- ----------
- mapper : dict-like or function
- Dict-like or functions transformations to apply to
- that axis' values. Use either ``mapper`` and ``axis`` to
- specify the axis to target with ``mapper``, or ``index`` and
- ``columns``.
- index : dict-like or function
- Alternative to specifying axis (``mapper, axis=0``
- is equivalent to ``index=mapper``).
- columns : dict-like or function
- Alternative to specifying axis (``mapper, axis=1``
- is equivalent to ``columns=mapper``).
- axis : int or str
- Axis to target with ``mapper``. Can be either the axis name
- ('index', 'columns') or number (0, 1). The default is 'index'.
- copy : bool, default True
- Also copy underlying data.
- inplace : bool, default False
- Whether to return a new DataFrame. If True, the value of copy is
- ignored.
- level : int or level name, default None
- In case of a MultiIndex, only rename labels in the specified
- level.
- errors : {'ignore', 'raise'}, default 'ignore'
- If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
- or `columns` contains labels that are not present in the Index
- being transformed.
- If 'ignore', existing keys will be renamed and extra keys will be
- ignored.
- Returns
- -------
- DataFrame
- DataFrame with the renamed axis labels.
- Raises
- ------
- KeyError
- If any of the labels is not found in the selected axis and
- "errors='raise'".
- See Also
- --------
- DataFrame.rename_axis : Set the name of the axis.
- Examples
- --------
- ``DataFrame.rename`` supports two calling conventions
- * ``(index=index_mapper, columns=columns_mapper, ...)``
- * ``(mapper, axis={'index', 'columns'}, ...)``
- We *highly* recommend using keyword arguments to clarify your
- intent.
- Rename columns using a mapping:
- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
- >>> df.rename(columns={"A": "a", "B": "c"})
- a c
- 0 1 4
- 1 2 5
- 2 3 6
- Rename index using a mapping:
- >>> df.rename(index={0: "x", 1: "y", 2: "z"})
- A B
- x 1 4
- y 2 5
- z 3 6
- Cast index labels to a different type:
- >>> df.index
- RangeIndex(start=0, stop=3, step=1)
- >>> df.rename(index=str).index
- Index(['0', '1', '2'], dtype='object')
- >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
- Traceback (most recent call last):
- KeyError: ['C'] not found in axis
- Using axis-style parameters
- >>> df.rename(str.lower, axis='columns')
- a b
- 0 1 4
- 1 2 5
- 2 3 6
- >>> df.rename({1: 2, 2: 4}, axis='index')
- A B
- 0 1 4
- 2 2 5
- 4 3 6
- """
- return super().rename(
- mapper=mapper,
- index=index,
- columns=columns,
- axis=axis,
- copy=copy,
- inplace=inplace,
- level=level,
- errors=errors,
- )
- @Substitution(**_shared_doc_kwargs)
- @Appender(NDFrame.fillna.__doc__)
- def fillna(
- self,
- value=None,
- method=None,
- axis=None,
- inplace=False,
- limit=None,
- downcast=None,
- ) -> Optional["DataFrame"]:
- return super().fillna(
- value=value,
- method=method,
- axis=axis,
- inplace=inplace,
- limit=limit,
- downcast=downcast,
- )
- @Appender(_shared_docs["replace"] % _shared_doc_kwargs)
- def replace(
- self,
- to_replace=None,
- value=None,
- inplace=False,
- limit=None,
- regex=False,
- method="pad",
- ):
- return super().replace(
- to_replace=to_replace,
- value=value,
- inplace=inplace,
- limit=limit,
- regex=regex,
- method=method,
- )
- @Appender(_shared_docs["shift"] % _shared_doc_kwargs)
- def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame":
- return super().shift(
- periods=periods, freq=freq, axis=axis, fill_value=fill_value
- )
- def set_index(
- self, keys, drop=True, append=False, inplace=False, verify_integrity=False
- ):
- """
- Set the DataFrame index using existing columns.
- Set the DataFrame index (row labels) using one or more existing
- columns or arrays (of the correct length). The index can replace the
- existing index or expand on it.
- Parameters
- ----------
- keys : label or array-like or list of labels/arrays
- This parameter can be either a single column key, a single array of
- the same length as the calling DataFrame, or a list containing an
- arbitrary combination of column keys and arrays. Here, "array"
- encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
- instances of :class:`~collections.abc.Iterator`.
- drop : bool, default True
- Delete columns to be used as the new index.
- append : bool, default False
- Whether to append columns to existing index.
- inplace : bool, default False
- Modify the DataFrame in place (do not create a new object).
- verify_integrity : bool, default False
- Check the new index for duplicates. Otherwise defer the check until
- necessary. Setting to False will improve the performance of this
- method.
- Returns
- -------
- DataFrame
- Changed row labels.
- See Also
- --------
- DataFrame.reset_index : Opposite of set_index.
- DataFrame.reindex : Change to new indices or expand indices.
- DataFrame.reindex_like : Change to same indices as other DataFrame.
- Examples
- --------
- >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
- ... 'year': [2012, 2014, 2013, 2014],
- ... 'sale': [55, 40, 84, 31]})
- >>> df
- month year sale
- 0 1 2012 55
- 1 4 2014 40
- 2 7 2013 84
- 3 10 2014 31
- Set the index to become the 'month' column:
- >>> df.set_index('month')
- year sale
- month
- 1 2012 55
- 4 2014 40
- 7 2013 84
- 10 2014 31
- Create a MultiIndex using columns 'year' and 'month':
- >>> df.set_index(['year', 'month'])
- sale
- year month
- 2012 1 55
- 2014 4 40
- 2013 7 84
- 2014 10 31
- Create a MultiIndex using an Index and a column:
- >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
- month sale
- year
- 1 2012 1 55
- 2 2014 4 40
- 3 2013 7 84
- 4 2014 10 31
- Create a MultiIndex using two Series:
- >>> s = pd.Series([1, 2, 3, 4])
- >>> df.set_index([s, s**2])
- month year sale
- 1 1 1 2012 55
- 2 4 4 2014 40
- 3 9 7 2013 84
- 4 16 10 2014 31
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- if not isinstance(keys, list):
- keys = [keys]
- err_msg = (
- 'The parameter "keys" may be a column key, one-dimensional '
- "array, or a list containing only valid column keys and "
- "one-dimensional arrays."
- )
- missing: List[Optional[Hashable]] = []
- for col in keys:
- if isinstance(
- col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator)
- ):
- # arrays are fine as long as they are one-dimensional
- # iterators get converted to list below
- if getattr(col, "ndim", 1) != 1:
- raise ValueError(err_msg)
- else:
- # everything else gets tried as a key; see GH 24969
- try:
- found = col in self.columns
- except TypeError:
- raise TypeError(f"{err_msg}. Received column of type {type(col)}")
- else:
- if not found:
- missing.append(col)
- if missing:
- raise KeyError(f"None of {missing} are in the columns")
- if inplace:
- frame = self
- else:
- frame = self.copy()
- arrays = []
- names = []
- if append:
- names = list(self.index.names)
- if isinstance(self.index, ABCMultiIndex):
- for i in range(self.index.nlevels):
- arrays.append(self.index._get_level_values(i))
- else:
- arrays.append(self.index)
- to_remove: List[Optional[Hashable]] = []
- for col in keys:
- if isinstance(col, ABCMultiIndex):
- for n in range(col.nlevels):
- arrays.append(col._get_level_values(n))
- names.extend(col.names)
- elif isinstance(col, (ABCIndexClass, ABCSeries)):
- # if Index then not MultiIndex (treated above)
- arrays.append(col)
- names.append(col.name)
- elif isinstance(col, (list, np.ndarray)):
- arrays.append(col)
- names.append(None)
- elif isinstance(col, abc.Iterator):
- arrays.append(list(col))
- names.append(None)
- # from here, col can only be a column label
- else:
- arrays.append(frame[col]._values)
- names.append(col)
- if drop:
- to_remove.append(col)
- if len(arrays[-1]) != len(self):
- # check newest element against length of calling frame, since
- # ensure_index_from_sequences would not raise for append=False.
- raise ValueError(
- f"Length mismatch: Expected {len(self)} rows, "
- f"received array of length {len(arrays[-1])}"
- )
- index = ensure_index_from_sequences(arrays, names)
- if verify_integrity and not index.is_unique:
- duplicates = index[index.duplicated()].unique()
- raise ValueError(f"Index has duplicate keys: {duplicates}")
- # use set to handle duplicate column names gracefully in case of drop
- for c in set(to_remove):
- del frame[c]
- # clear up memory usage
- index._cleanup()
- frame.index = index
- if not inplace:
- return frame
- def reset_index(
- self,
- level: Optional[Union[Hashable, Sequence[Hashable]]] = None,
- drop: bool = False,
- inplace: bool = False,
- col_level: Hashable = 0,
- col_fill: Optional[Hashable] = "",
- ) -> Optional["DataFrame"]:
- """
- Reset the index, or a level of it.
- Reset the index of the DataFrame, and use the default one instead.
- If the DataFrame has a MultiIndex, this method can remove one or more
- levels.
- Parameters
- ----------
- level : int, str, tuple, or list, default None
- Only remove the given levels from the index. Removes all levels by
- default.
- drop : bool, default False
- Do not try to insert index into dataframe columns. This resets
- the index to the default integer index.
- inplace : bool, default False
- Modify the DataFrame in place (do not create a new object).
- col_level : int or str, default 0
- If the columns have multiple levels, determines which level the
- labels are inserted into. By default it is inserted into the first
- level.
- col_fill : object, default ''
- If the columns have multiple levels, determines how the other
- levels are named. If None then the index name is repeated.
- Returns
- -------
- DataFrame or None
- DataFrame with the new index or None if ``inplace=True``.
- See Also
- --------
- DataFrame.set_index : Opposite of reset_index.
- DataFrame.reindex : Change to new indices or expand indices.
- DataFrame.reindex_like : Change to same indices as other DataFrame.
- Examples
- --------
- >>> df = pd.DataFrame([('bird', 389.0),
- ... ('bird', 24.0),
- ... ('mammal', 80.5),
- ... ('mammal', np.nan)],
- ... index=['falcon', 'parrot', 'lion', 'monkey'],
- ... columns=('class', 'max_speed'))
- >>> df
- class max_speed
- falcon bird 389.0
- parrot bird 24.0
- lion mammal 80.5
- monkey mammal NaN
- When we reset the index, the old index is added as a column, and a
- new sequential index is used:
- >>> df.reset_index()
- index class max_speed
- 0 falcon bird 389.0
- 1 parrot bird 24.0
- 2 lion mammal 80.5
- 3 monkey mammal NaN
- We can use the `drop` parameter to avoid the old index being added as
- a column:
- >>> df.reset_index(drop=True)
- class max_speed
- 0 bird 389.0
- 1 bird 24.0
- 2 mammal 80.5
- 3 mammal NaN
- You can also use `reset_index` with `MultiIndex`.
- >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
- ... ('bird', 'parrot'),
- ... ('mammal', 'lion'),
- ... ('mammal', 'monkey')],
- ... names=['class', 'name'])
- >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
- ... ('species', 'type')])
- >>> df = pd.DataFrame([(389.0, 'fly'),
- ... ( 24.0, 'fly'),
- ... ( 80.5, 'run'),
- ... (np.nan, 'jump')],
- ... index=index,
- ... columns=columns)
- >>> df
- speed species
- max type
- class name
- bird falcon 389.0 fly
- parrot 24.0 fly
- mammal lion 80.5 run
- monkey NaN jump
- If the index has multiple levels, we can reset a subset of them:
- >>> df.reset_index(level='class')
- class speed species
- max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- If we are not dropping the index, by default, it is placed in the top
- level. We can place it in another level:
- >>> df.reset_index(level='class', col_level=1)
- speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- When the index is inserted under another level, we can specify under
- which one with the parameter `col_fill`:
- >>> df.reset_index(level='class', col_level=1, col_fill='species')
- species speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- If we specify a nonexistent level for `col_fill`, it is created:
- >>> df.reset_index(level='class', col_level=1, col_fill='genus')
- genus speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- if inplace:
- new_obj = self
- else:
- new_obj = self.copy()
- def _maybe_casted_values(index, labels=None):
- values = index._values
- if not isinstance(index, (PeriodIndex, DatetimeIndex)):
- if values.dtype == np.object_:
- values = lib.maybe_convert_objects(values)
- # if we have the labels, extract the values with a mask
- if labels is not None:
- mask = labels == -1
- # we can have situations where the whole mask is -1,
- # meaning nothing was found in labels, so fill everything with NaN
- if mask.all():
- values = np.empty(len(mask))
- values.fill(np.nan)
- else:
- values = values.take(labels)
- # TODO(https://github.com/pandas-dev/pandas/issues/24206)
- # Push this into maybe_upcast_putmask?
- # We can't pass EAs there right now. Looks a bit
- # complicated.
- # So we unbox the ndarray_values, op, re-box.
- values_type = type(values)
- values_dtype = values.dtype
- if issubclass(values_type, DatetimeLikeArray):
- values = values._data
- if mask.any():
- values, _ = maybe_upcast_putmask(values, mask, np.nan)
- if issubclass(values_type, DatetimeLikeArray):
- values = values_type(values, dtype=values_dtype)
- return values
- new_index = ibase.default_index(len(new_obj))
- if level is not None:
- if not isinstance(level, (tuple, list)):
- level = [level]
- level = [self.index._get_level_number(lev) for lev in level]
- if len(level) < self.index.nlevels:
- new_index = self.index.droplevel(level)
- if not drop:
- to_insert: Iterable[Tuple[Any, Optional[Any]]]
- if isinstance(self.index, ABCMultiIndex):
- names = [
- (n if n is not None else f"level_{i}")
- for i, n in enumerate(self.index.names)
- ]
- to_insert = zip(self.index.levels, self.index.codes)
- else:
- default = "index" if "index" not in self else "level_0"
- names = [default] if self.index.name is None else [self.index.name]
- to_insert = ((self.index, None),)
- multi_col = isinstance(self.columns, ABCMultiIndex)
- for i, (lev, lab) in reversed(list(enumerate(to_insert))):
- if not (level is None or i in level):
- continue
- name = names[i]
- if multi_col:
- col_name = list(name) if isinstance(name, tuple) else [name]
- if col_fill is None:
- if len(col_name) not in (1, self.columns.nlevels):
- raise ValueError(
- "col_fill=None is incompatible "
- f"with incomplete column name {name}"
- )
- col_fill = col_name[0]
- lev_num = self.columns._get_level_number(col_level)
- name_lst = [col_fill] * lev_num + col_name
- missing = self.columns.nlevels - len(name_lst)
- name_lst += [col_fill] * missing
- name = tuple(name_lst)
- # to ndarray and maybe infer different dtype
- level_values = _maybe_casted_values(lev, lab)
- new_obj.insert(0, name, level_values)
- new_obj.index = new_index
- if not inplace:
- return new_obj
- return None
- # ----------------------------------------------------------------------
- # Reindex-based selection methods
- @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
- def isna(self) -> "DataFrame":
- return super().isna()
- @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
- def isnull(self) -> "DataFrame":
- return super().isnull()
- @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
- def notna(self) -> "DataFrame":
- return super().notna()
- @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
- def notnull(self) -> "DataFrame":
- return super().notnull()
- def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
- """
- Remove missing values.
- See the :ref:`User Guide <missing_data>` for more on which values are
- considered missing, and how to work with missing data.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Determine if rows or columns which contain missing values are
- removed.
- * 0, or 'index' : Drop rows which contain missing values.
- * 1, or 'columns' : Drop columns which contain missing values.
- .. versionchanged:: 1.0.0
- Passing a tuple or list to drop on multiple axes is no longer
- supported; only a single axis is allowed.
- how : {'any', 'all'}, default 'any'
- Determine whether a row or column is removed from the DataFrame
- when it has at least one NA or all NA values.
- * 'any' : If any NA values are present, drop that row or column.
- * 'all' : If all values are NA, drop that row or column.
- thresh : int, optional
- Require that many non-NA values.
- subset : array-like, optional
- Labels along other axis to consider, e.g. if you are dropping rows
- these would be a list of columns to include.
- inplace : bool, default False
- If True, do operation inplace and return None.
- Returns
- -------
- DataFrame
- DataFrame with NA entries dropped from it.
- See Also
- --------
- DataFrame.isna: Indicate missing values.
- DataFrame.notna : Indicate existing (non-missing) values.
- DataFrame.fillna : Replace missing values.
- Series.dropna : Drop missing values.
- Index.dropna : Drop missing indices.
- Examples
- --------
- >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
- ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
- ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
- ... pd.NaT]})
- >>> df
- name toy born
- 0 Alfred NaN NaT
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
- Drop the rows where at least one element is missing.
- >>> df.dropna()
- name toy born
- 1 Batman Batmobile 1940-04-25
- Drop the columns where at least one element is missing.
- >>> df.dropna(axis='columns')
- name
- 0 Alfred
- 1 Batman
- 2 Catwoman
- Drop the rows where all elements are missing.
- >>> df.dropna(how='all')
- name toy born
- 0 Alfred NaN NaT
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
- Keep only the rows with at least 2 non-NA values.
- >>> df.dropna(thresh=2)
- name toy born
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
- Define in which columns to look for missing values.
- >>> df.dropna(subset=['name', 'born'])
- name toy born
- 1 Batman Batmobile 1940-04-25
- Keep the DataFrame with valid entries in the same variable.
- >>> df.dropna(inplace=True)
- >>> df
- name toy born
- 1 Batman Batmobile 1940-04-25
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- if isinstance(axis, (tuple, list)):
- # GH20987
- raise TypeError("supplying multiple axes to axis is no longer supported.")
- axis = self._get_axis_number(axis)
- agg_axis = 1 - axis
- agg_obj = self
- if subset is not None:
- ax = self._get_axis(agg_axis)
- indices = ax.get_indexer_for(subset)
- check = indices == -1
- if check.any():
- raise KeyError(list(np.compress(check, subset)))
- agg_obj = self.take(indices, axis=agg_axis)
- count = agg_obj.count(axis=agg_axis)
- if thresh is not None:
- mask = count >= thresh
- elif how == "any":
- mask = count == len(agg_obj._get_axis(agg_axis))
- elif how == "all":
- mask = count > 0
- else:
- if how is not None:
- raise ValueError(f"invalid how option: {how}")
- else:
- raise TypeError("must specify how or thresh")
- result = self.loc(axis=axis)[mask]
- if inplace:
- self._update_inplace(result)
- else:
- return result
- def drop_duplicates(
- self,
- subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
- keep: Union[str, bool] = "first",
- inplace: bool = False,
- ignore_index: bool = False,
- ) -> Optional["DataFrame"]:
- """
- Return DataFrame with duplicate rows removed.
- Considering certain columns is optional. Indexes, including time
- indexes, are ignored.
- Parameters
- ----------
- subset : column label or sequence of labels, optional
- Only consider certain columns for identifying duplicates, by
- default use all of the columns.
- keep : {'first', 'last', False}, default 'first'
- Determines which duplicates (if any) to keep.
- - ``first`` : Drop duplicates except for the first occurrence.
- - ``last`` : Drop duplicates except for the last occurrence.
- - False : Drop all duplicates.
- inplace : bool, default False
- Whether to drop duplicates in place or to return a copy.
- ignore_index : bool, default False
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
- .. versionadded:: 1.0.0
- Returns
- -------
- DataFrame
- DataFrame with duplicates removed or None if ``inplace=True``.
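- Examples
- --------
- By default the first occurrence of each duplicated row is kept:
- >>> df = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'x', 'y']})
- >>> df.drop_duplicates()
- a  b
- 0  1  x
- 2  2  y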
- """
- if self.empty:
- return self.copy()
- inplace = validate_bool_kwarg(inplace, "inplace")
- duplicated = self.duplicated(subset, keep=keep)
- if inplace:
- (inds,) = (~duplicated)._ndarray_values.nonzero()
- new_data = self._data.take(inds)
- if ignore_index:
- new_data.axes[1] = ibase.default_index(len(inds))
- self._update_inplace(new_data)
- else:
- result = self[~duplicated]
- if ignore_index:
- result.index = ibase.default_index(len(result))
- return result
- return None
- def duplicated(
- self,
- subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
- keep: Union[str, bool] = "first",
- ) -> "Series":
- """
- Return boolean Series denoting duplicate rows.
- Considering certain columns is optional.
- Parameters
- ----------
- subset : column label or sequence of labels, optional
- Only consider certain columns for identifying duplicates, by
- default use all of the columns.
- keep : {'first', 'last', False}, default 'first'
- Determines which duplicates (if any) to mark.
- - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
- - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
- - False : Mark all duplicates as ``True``.
- Returns
- -------
- Series
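- Examples
- --------
- With the default ``keep='first'``, only later repeats are marked:
- >>> df = pd.DataFrame({'a': [1, 1, 2]})
- >>> df.duplicated()
- 0    False
- 1     True
- 2    False
- dtype: bool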
- """
- from pandas.core.sorting import get_group_index
- from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
- if self.empty:
- return Series(dtype=bool)
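- # factorize each column into integer labels; the per-column labels are
- # later combined into a single group id per row via get_group_index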
- def f(vals):
- labels, shape = algorithms.factorize(
- vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)
- )
- return labels.astype("i8", copy=False), len(shape)
- if subset is None:
- subset = self.columns
- elif (
- not np.iterable(subset)
- or isinstance(subset, str)
- or isinstance(subset, tuple)
- and subset in self.columns
- ):
- subset = (subset,)
- # needed for mypy since can't narrow types using np.iterable
- subset = cast(Iterable, subset)
- # Verify all columns in subset exist in the queried dataframe
- # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
- # key that doesn't exist.
- diff = Index(subset).difference(self.columns)
- if not diff.empty:
- raise KeyError(diff)
- vals = (col.values for name, col in self.items() if name in subset)
- labels, shape = map(list, zip(*map(f, vals)))
- ids = get_group_index(labels, shape, sort=False, xnull=False)
- return Series(duplicated_int64(ids, keep), index=self.index)
- # ----------------------------------------------------------------------
- # Sorting
- @Substitution(**_shared_doc_kwargs)
- @Appender(NDFrame.sort_values.__doc__)
- def sort_values(
- self,
- by,
- axis=0,
- ascending=True,
- inplace=False,
- kind="quicksort",
- na_position="last",
- ignore_index=False,
- ):
- inplace = validate_bool_kwarg(inplace, "inplace")
- axis = self._get_axis_number(axis)
- if not isinstance(by, list):
- by = [by]
- if is_sequence(ascending) and len(by) != len(ascending):
- raise ValueError(
- f"Length of ascending ({len(ascending)}) != length of by ({len(by)})"
- )
- if len(by) > 1:
- from pandas.core.sorting import lexsort_indexer
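- # multi-key sort: the first key is the primary sort key and each
- # subsequent key only breaks ties among equal earlier keys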
- keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
- indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position)
- indexer = ensure_platform_int(indexer)
- else:
- from pandas.core.sorting import nargsort
- by = by[0]
- k = self._get_label_or_level_values(by, axis=axis)
- if isinstance(ascending, (tuple, list)):
- ascending = ascending[0]
- indexer = nargsort(
- k, kind=kind, ascending=ascending, na_position=na_position
- )
- new_data = self._data.take(
- indexer, axis=self._get_block_manager_axis(axis), verify=False
- )
- if ignore_index:
- new_data.axes[1] = ibase.default_index(len(indexer))
- if inplace:
- return self._update_inplace(new_data)
- else:
- return self._constructor(new_data).__finalize__(self)
- @Substitution(**_shared_doc_kwargs)
- @Appender(NDFrame.sort_index.__doc__)
- def sort_index(
- self,
- axis=0,
- level=None,
- ascending=True,
- inplace=False,
- kind="quicksort",
- na_position="last",
- sort_remaining=True,
- ignore_index: bool = False,
- ):
- # TODO: this can be combined with Series.sort_index impl as
- # almost identical
- inplace = validate_bool_kwarg(inplace, "inplace")
- axis = self._get_axis_number(axis)
- labels = self._get_axis(axis)
- # make sure that the axis is lexsorted to start
- # if not we need to reconstruct to get the correct indexer
- labels = labels._sort_levels_monotonic()
- if level is not None:
- new_axis, indexer = labels.sortlevel(
- level, ascending=ascending, sort_remaining=sort_remaining
- )
- elif isinstance(labels, ABCMultiIndex):
- from pandas.core.sorting import lexsort_indexer
- indexer = lexsort_indexer(
- labels._get_codes_for_sorting(),
- orders=ascending,
- na_position=na_position,
- )
- else:
- from pandas.core.sorting import nargsort
- # Check monotonic-ness before sort an index
- # GH11080
- if (ascending and labels.is_monotonic_increasing) or (
- not ascending and labels.is_monotonic_decreasing
- ):
- if inplace:
- return
- else:
- return self.copy()
- indexer = nargsort(
- labels, kind=kind, ascending=ascending, na_position=na_position
- )
- baxis = self._get_block_manager_axis(axis)
- new_data = self._data.take(indexer, axis=baxis, verify=False)
- # reconstruct axis if needed
- new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
- if ignore_index:
- new_data.axes[1] = ibase.default_index(len(indexer))
- if inplace:
- return self._update_inplace(new_data)
- else:
- return self._constructor(new_data).__finalize__(self)
- def nlargest(self, n, columns, keep="first") -> "DataFrame":
- """
- Return the first `n` rows ordered by `columns` in descending order.
- Return the first `n` rows with the largest values in `columns`, in
- descending order. The columns that are not specified are returned as
- well, but not used for ordering.
- This method is equivalent to
- ``df.sort_values(columns, ascending=False).head(n)``, but more
- performant.
- Parameters
- ----------
- n : int
- Number of rows to return.
- columns : label or list of labels
- Column label(s) to order by.
- keep : {'first', 'last', 'all'}, default 'first'
- Where there are duplicate values:
- - ``first`` : prioritize the first occurrence(s)
- - ``last`` : prioritize the last occurrence(s)
- - ``all`` : do not drop any duplicates, even if it means
- selecting more than `n` items.
- .. versionadded:: 0.24.0
- Returns
- -------
- DataFrame
- The first `n` rows ordered by the given columns in descending
- order.
- See Also
- --------
- DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
- ascending order.
- DataFrame.sort_values : Sort DataFrame by the values.
- DataFrame.head : Return the first `n` rows without re-ordering.
- Notes
- -----
- This function cannot be used with all column types. For example, when
- specifying columns with `object` or `category` dtypes, ``TypeError`` is
- raised.
- Examples
- --------
- >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
- ... 434000, 434000, 337000, 11300,
- ... 11300, 11300],
- ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
- ... 17036, 182, 38, 311],
- ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
- ... "IS", "NR", "TV", "AI"]},
- ... index=["Italy", "France", "Malta",
- ... "Maldives", "Brunei", "Iceland",
- ... "Nauru", "Tuvalu", "Anguilla"])
- >>> df
- population GDP alpha-2
- Italy 59000000 1937894 IT
- France 65000000 2583560 FR
- Malta 434000 12011 MT
- Maldives 434000 4520 MV
- Brunei 434000 12128 BN
- Iceland 337000 17036 IS
- Nauru 11300 182 NR
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- In the following example, we will use ``nlargest`` to select the three
- rows having the largest values in column "population".
- >>> df.nlargest(3, 'population')
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Malta 434000 12011 MT
- When using ``keep='last'``, ties are resolved in reverse order:
- >>> df.nlargest(3, 'population', keep='last')
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Brunei 434000 12128 BN
- When using ``keep='all'``, all duplicate items are maintained:
- >>> df.nlargest(3, 'population', keep='all')
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Malta 434000 12011 MT
- Maldives 434000 4520 MV
- Brunei 434000 12128 BN
- To order by the largest values in column "population" and then "GDP",
- we can specify multiple columns like in the next example.
- >>> df.nlargest(3, ['population', 'GDP'])
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Brunei 434000 12128 BN
- """
- return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
- def nsmallest(self, n, columns, keep="first") -> "DataFrame":
- """
- Return the first `n` rows ordered by `columns` in ascending order.
- Return the first `n` rows with the smallest values in `columns`, in
- ascending order. The columns that are not specified are returned as
- well, but not used for ordering.
- This method is equivalent to
- ``df.sort_values(columns, ascending=True).head(n)``, but more
- performant.
- Parameters
- ----------
- n : int
- Number of items to retrieve.
- columns : list or str
- Column name or names to order by.
- keep : {'first', 'last', 'all'}, default 'first'
- Where there are duplicate values:
- - ``first`` : take the first occurrence.
- - ``last`` : take the last occurrence.
- - ``all`` : do not drop any duplicates, even if it means
- selecting more than `n` items.
- .. versionadded:: 0.24.0
- Returns
- -------
- DataFrame
- See Also
- --------
- DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
- descending order.
- DataFrame.sort_values : Sort DataFrame by the values.
- DataFrame.head : Return the first `n` rows without re-ordering.
- Examples
- --------
- >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
- ... 434000, 434000, 337000, 11300,
- ... 11300, 11300],
- ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
- ... 17036, 182, 38, 311],
- ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
- ... "IS", "NR", "TV", "AI"]},
- ... index=["Italy", "France", "Malta",
- ... "Maldives", "Brunei", "Iceland",
- ... "Nauru", "Tuvalu", "Anguilla"])
- >>> df
- population GDP alpha-2
- Italy 59000000 1937894 IT
- France 65000000 2583560 FR
- Malta 434000 12011 MT
- Maldives 434000 4520 MV
- Brunei 434000 12128 BN
- Iceland 337000 17036 IS
- Nauru 11300 182 NR
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- In the following example, we will use ``nsmallest`` to select the
- three rows having the smallest values in column "population".
- >>> df.nsmallest(3, 'population')
- population GDP alpha-2
- Nauru 11300 182 NR
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- When using ``keep='last'``, ties are resolved in reverse order:
- >>> df.nsmallest(3, 'population', keep='last')
- population GDP alpha-2
- Anguilla 11300 311 AI
- Tuvalu 11300 38 TV
- Nauru 11300 182 NR
- When using ``keep='all'``, all duplicate items are maintained:
- >>> df.nsmallest(3, 'population', keep='all')
- population GDP alpha-2
- Nauru 11300 182 NR
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- To order by the smallest values in column "population" and then "GDP",
- we can specify multiple columns like in the next example.
- >>> df.nsmallest(3, ['population', 'GDP'])
- population GDP alpha-2
- Tuvalu 11300 38 TV
- Nauru 11300 182 NR
- Anguilla 11300 311 AI
- """
- return algorithms.SelectNFrame(
- self, n=n, keep=keep, columns=columns
- ).nsmallest()
- def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame":
- """
- Swap levels i and j in a MultiIndex on a particular axis.
- Parameters
- ----------
- i, j : int or str
- Levels of the indices to be swapped. Can pass level name as string.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to swap levels on.
- Returns
- -------
- DataFrame
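- Examples
- --------
- A small sketch with a two-level index; only the level order changes:
- >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], [1, 2]],
- ... names=['outer', 'inner'])
- >>> df = pd.DataFrame({'v': [10, 20]}, index=mi)
- >>> df.swaplevel().index.names
- FrozenList(['inner', 'outer'])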
- """
- result = self.copy()
- axis = self._get_axis_number(axis)
- if axis == 0:
- result.index = result.index.swaplevel(i, j)
- else:
- result.columns = result.columns.swaplevel(i, j)
- return result
- def reorder_levels(self, order, axis=0) -> "DataFrame":
- """
- Rearrange index levels using input order. May not drop or duplicate levels.
- Parameters
- ----------
- order : list of int or list of str
- List representing new level order. Reference level by number
- (position) or by key (label).
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Where to reorder levels.
- Returns
- -------
- DataFrame
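- Examples
- --------
- A small sketch; levels are referenced by name:
- >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], [1, 2]],
- ... names=['outer', 'inner'])
- >>> df = pd.DataFrame({'v': [10, 20]}, index=mi)
- >>> df.reorder_levels(['inner', 'outer']).index.names
- FrozenList(['inner', 'outer'])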
- """
- axis = self._get_axis_number(axis)
- if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover
- raise TypeError("Can only reorder levels on a hierarchical axis.")
- result = self.copy()
- if axis == 0:
- result.index = result.index.reorder_levels(order)
- else:
- result.columns = result.columns.reorder_levels(order)
- return result
- # ----------------------------------------------------------------------
- # Arithmetic / combination related
- def _combine_frame(self, other, func, fill_value=None, level=None):
- # at this point we have `self._indexed_same(other)`
- if fill_value is None:
- # since _arith_op may be called in a loop, avoid function call
- # overhead if possible by doing this check once
- _arith_op = func
- else:
- def _arith_op(left, right):
- # for the mixed_type case where we iterate over columns,
- # _arith_op(left, right) is equivalent to
- # left._binop(right, func, fill_value=fill_value)
- left, right = ops.fill_binop(left, right, fill_value)
- return func(left, right)
- if ops.should_series_dispatch(self, other, func):
- # iterate over columns
- new_data = ops.dispatch_to_series(self, other, _arith_op)
- else:
- with np.errstate(all="ignore"):
- res_values = _arith_op(self.values, other.values)
- new_data = dispatch_fill_zeros(func, self.values, other.values, res_values)
- return new_data
- def _combine_match_index(self, other, func):
- # at this point we have `self.index.equals(other.index)`
- if ops.should_series_dispatch(self, other, func):
- # operate column-wise; avoid costly object-casting in `.values`
- new_data = ops.dispatch_to_series(self, other, func)
- else:
- # fastpath --> operate directly on values
- with np.errstate(all="ignore"):
- new_data = func(self.values.T, other.values).T
- return new_data
- def _construct_result(self, result) -> "DataFrame":
- """
- Wrap the result of an arithmetic, comparison, or logical operation.
- Parameters
- ----------
- result : DataFrame
- Returns
- -------
- DataFrame
- """
- out = self._constructor(result, index=self.index, copy=False)
- # Pin columns instead of passing to constructor for compat with
- # non-unique columns case
- out.columns = self.columns
- return out
- def combine(
- self, other: "DataFrame", func, fill_value=None, overwrite=True
- ) -> "DataFrame":
- """
- Perform column-wise combine with another DataFrame.
- Combines a DataFrame with `other` DataFrame using `func`
- to element-wise combine columns. The row and column indexes of the
- resulting DataFrame will be the union of the two.
- Parameters
- ----------
- other : DataFrame
- The DataFrame to merge column-wise.
- func : function
- Function that takes two series as inputs and returns a Series or a
- scalar. Used to merge the two dataframes column by column.
- fill_value : scalar value, default None
- The value to fill NaNs with prior to passing any column to the
- merge func.
- overwrite : bool, default True
- If True, columns in `self` that do not exist in `other` will be
- overwritten with NaNs.
- Returns
- -------
- DataFrame
- Combination of the provided DataFrames.
- See Also
- --------
- DataFrame.combine_first : Combine two DataFrame objects and default to
- non-null values in frame calling the method.
- Examples
- --------
- Combine using a simple function that chooses the smaller column.
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
- >>> df1.combine(df2, take_smaller)
- A B
- 0 0 3
- 1 0 3
- Example using a true element-wise combine function.
- >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> df1.combine(df2, np.minimum)
- A B
- 0 1 2
- 1 0 3
- Using `fill_value` fills Nones prior to passing the column to the
- merge function.
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> df1.combine(df2, take_smaller, fill_value=-5)
- A B
- 0 0 -5.0
- 1 0 4.0
- However, if the same element in both dataframes is None, that None
- is preserved.
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
- >>> df1.combine(df2, take_smaller, fill_value=-5)
- A B
- 0 0 -5.0
- 1 0 3.0
- Example that demonstrates the use of `overwrite` and behavior when
- the axes differ between the dataframes.
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
- >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
- >>> df1.combine(df2, take_smaller)
- A B C
- 0 NaN NaN NaN
- 1 NaN 3.0 -10.0
- 2 NaN 3.0 1.0
- >>> df1.combine(df2, take_smaller, overwrite=False)
- A B C
- 0 0.0 NaN NaN
- 1 0.0 3.0 -10.0
- 2 NaN 3.0 1.0
- Demonstrating the preference of the passed-in dataframe.
- >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
- >>> df2.combine(df1, take_smaller)
- A B C
- 0 0.0 NaN NaN
- 1 0.0 3.0 NaN
- 2 NaN 3.0 NaN
- >>> df2.combine(df1, take_smaller, overwrite=False)
- A B C
- 0 0.0 NaN NaN
- 1 0.0 3.0 1.0
- 2 NaN 3.0 1.0
- """
- other_idxlen = len(other.index) # save for compare
- this, other = self.align(other, copy=False)
- new_index = this.index
- if other.empty and len(new_index) == len(self.index):
- return self.copy()
- if self.empty and len(other) == other_idxlen:
- return other.copy()
- # sorts if possible
- new_columns = this.columns.union(other.columns)
- do_fill = fill_value is not None
- result = {}
- for col in new_columns:
- series = this[col]
- otherSeries = other[col]
- this_dtype = series.dtype
- other_dtype = otherSeries.dtype
- this_mask = isna(series)
- other_mask = isna(otherSeries)
- # don't overwrite columns unnecessarily
- # DO propagate if this column is not in the intersection
- if not overwrite and other_mask.all():
- result[col] = this[col].copy()
- continue
- if do_fill:
- series = series.copy()
- otherSeries = otherSeries.copy()
- series[this_mask] = fill_value
- otherSeries[other_mask] = fill_value
- if col not in self.columns:
- # If self DataFrame does not have col in other DataFrame,
- # try to promote series, which is all NaN, as other_dtype.
- new_dtype = other_dtype
- try:
- series = series.astype(new_dtype, copy=False)
- except ValueError:
- # e.g. new_dtype is integer types
- pass
- else:
- # if we have different dtypes, possibly promote
- new_dtype = find_common_type([this_dtype, other_dtype])
- if not is_dtype_equal(this_dtype, new_dtype):
- series = series.astype(new_dtype)
- if not is_dtype_equal(other_dtype, new_dtype):
- otherSeries = otherSeries.astype(new_dtype)
- arr = func(series, otherSeries)
- arr = maybe_downcast_to_dtype(arr, this_dtype)
- result[col] = arr
- # convert_objects just in case
- return self._constructor(result, index=new_index, columns=new_columns)
- def combine_first(self, other: "DataFrame") -> "DataFrame":
- """
- Update null elements with value in the same location in `other`.
- Combine two DataFrame objects by filling null values in one DataFrame
- with non-null values from other DataFrame. The row and column indexes
- of the resulting DataFrame will be the union of the two.
- Parameters
- ----------
- other : DataFrame
- Provided DataFrame to use to fill null values.
- Returns
- -------
- DataFrame
- See Also
- --------
- DataFrame.combine : Perform series-wise operation on two DataFrames
- using a given function.
- Examples
- --------
- >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> df1.combine_first(df2)
- A B
- 0 1.0 3.0
- 1 0.0 4.0
- Null values still persist if the location of that null value
- does not exist in `other`.
- >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
- >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
- >>> df1.combine_first(df2)
- A B C
- 0 NaN 4.0 NaN
- 1 0.0 3.0 1.0
- 2 NaN 3.0 1.0
- """
- import pandas.core.computation.expressions as expressions
- def extract_values(arr):
- # Does two things:
- # 1. maybe gets the values from the Series / Index
- # 2. convert datelike to i8
- if isinstance(arr, (ABCIndexClass, ABCSeries)):
- arr = arr._values
- if needs_i8_conversion(arr):
- if is_extension_array_dtype(arr.dtype):
- arr = arr.asi8
- else:
- arr = arr.view("i8")
- return arr
- def combiner(x, y):
- mask = isna(x)
- if isinstance(mask, (ABCIndexClass, ABCSeries)):
- mask = mask._values
- x_values = extract_values(x)
- y_values = extract_values(y)
- # If the column y in other DataFrame is not in first DataFrame,
- # just return y_values.
- if y.name not in self.columns:
- return y_values
- return expressions.where(mask, y_values, x_values)
- return self.combine(other, combiner, overwrite=False)
- def update(
- self, other, join="left", overwrite=True, filter_func=None, errors="ignore"
- ) -> None:
- """
- Modify in place using non-NA values from another DataFrame.
- Aligns on indices. There is no return value.
- Parameters
- ----------
- other : DataFrame, or object coercible into a DataFrame
- Should have at least one matching index/column label
- with the original DataFrame. If a Series is passed,
- its name attribute must be set, and that will be
- used as the column name to align with the original DataFrame.
- join : {'left'}, default 'left'
- Only left join is implemented, keeping the index and columns of the
- original object.
- overwrite : bool, default True
- How to handle non-NA values for overlapping keys:
- * True: overwrite original DataFrame's values
- with values from `other`.
- * False: only update values that are NA in
- the original DataFrame.
- filter_func : callable(1d-array) -> bool 1d-array, optional
- Can choose to replace values other than NA. Return True for values
- that should be updated.
- errors : {'raise', 'ignore'}, default 'ignore'
- If 'raise', will raise a ValueError if the DataFrame and `other`
- both contain non-NA data in the same place.
- .. versionchanged:: 0.24.0
- Changed from `raise_conflict=False|True`
- to `errors='ignore'|'raise'`.
- Returns
- -------
- None : method directly changes calling object
- Raises
- ------
- ValueError
- * When `errors='raise'` and there's overlapping non-NA data.
- * When `errors` is not either `'ignore'` or `'raise'`
- NotImplementedError
- * If `join != 'left'`
- See Also
- --------
- dict.update : Similar method for dictionaries.
- DataFrame.merge : For column(s)-on-columns(s) operations.
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 2, 3],
- ... 'B': [400, 500, 600]})
- >>> new_df = pd.DataFrame({'B': [4, 5, 6],
- ... 'C': [7, 8, 9]})
- >>> df.update(new_df)
- >>> df
- A B
- 0 1 4
- 1 2 5
- 2 3 6
- The DataFrame's length does not increase as a result of the update,
- only values at matching index/column labels are updated.
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
- ... 'B': ['x', 'y', 'z']})
- >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
- >>> df.update(new_df)
- >>> df
- A B
- 0 a d
- 1 b e
- 2 c f
- For Series, its name attribute must be set.
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
- ... 'B': ['x', 'y', 'z']})
- >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
- >>> df.update(new_column)
- >>> df
- A B
- 0 a d
- 1 b y
- 2 c e
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
- ... 'B': ['x', 'y', 'z']})
- >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
- >>> df.update(new_df)
- >>> df
- A B
- 0 a x
- 1 b d
- 2 c e
- If `other` contains NaNs the corresponding values are not updated
- in the original dataframe.
- >>> df = pd.DataFrame({'A': [1, 2, 3],
- ... 'B': [400, 500, 600]})
- >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
- >>> df.update(new_df)
- >>> df
- A B
- 0 1 4.0
- 1 2 500.0
- 2 3 6.0
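- `filter_func` restricts which existing values may be replaced; in this
- minimal illustration only values greater than 450 are updated:
- >>> df = pd.DataFrame({'A': [1, 2, 3],
- ... 'B': [400, 500, 600]})
- >>> new_df = pd.DataFrame({'B': [4, 5, 6]})
- >>> df.update(new_df, filter_func=lambda s: s > 450)
- >>> df
- A B
- 0 1 400
- 1 2 5
- 2 3 6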
- """
- import pandas.core.computation.expressions as expressions
- # TODO: Support other joins
- if join != "left": # pragma: no cover
- raise NotImplementedError("Only left join is supported")
- if errors not in ["ignore", "raise"]:
- raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
- if not isinstance(other, DataFrame):
- other = DataFrame(other)
- other = other.reindex_like(self)
- for col in self.columns:
- this = self[col]._values
- that = other[col]._values
- if filter_func is not None:
- with np.errstate(all="ignore"):
- mask = ~filter_func(this) | isna(that)
- else:
- if errors == "raise":
- mask_this = notna(that)
- mask_that = notna(this)
- if any(mask_this & mask_that):
- raise ValueError("Data overlaps.")
- if overwrite:
- mask = isna(that)
- else:
- mask = notna(this)
- # don't overwrite columns unnecessarily
- if mask.all():
- continue
- self[col] = expressions.where(mask, this, that)
- # ----------------------------------------------------------------------
- # Data reshaping
- @Appender(
- """
- Examples
- --------
- >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
- ... 'Parrot', 'Parrot'],
- ... 'Max Speed': [380., 370., 24., 26.]})
- >>> df
- Animal Max Speed
- 0 Falcon 380.0
- 1 Falcon 370.0
- 2 Parrot 24.0
- 3 Parrot 26.0
- >>> df.groupby(['Animal']).mean()
- Max Speed
- Animal
- Falcon 375.0
- Parrot 25.0
- **Hierarchical Indexes**
- We can groupby different levels of a hierarchical index
- using the `level` parameter:
- >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
- ... ['Captive', 'Wild', 'Captive', 'Wild']]
- >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
- >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
- ... index=index)
- >>> df
- Max Speed
- Animal Type
- Falcon Captive 390.0
- Wild 350.0
- Parrot Captive 30.0
- Wild 20.0
- >>> df.groupby(level=0).mean()
- Max Speed
- Animal
- Falcon 370.0
- Parrot 25.0
- >>> df.groupby(level="Type").mean()
- Max Speed
- Type
- Captive 210.0
- Wild 185.0
- """
- )
- @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
- def groupby(
- self,
- by=None,
- axis=0,
- level=None,
- as_index: bool = True,
- sort: bool = True,
- group_keys: bool = True,
- squeeze: bool = False,
- observed: bool = False,
- ) -> "groupby_generic.DataFrameGroupBy":
- if level is None and by is None:
- raise TypeError("You have to supply one of 'by' and 'level'")
- axis = self._get_axis_number(axis)
- return groupby_generic.DataFrameGroupBy(
- obj=self,
- keys=by,
- axis=axis,
- level=level,
- as_index=as_index,
- sort=sort,
- group_keys=group_keys,
- squeeze=squeeze,
- observed=observed,
- )
- _shared_docs[
- "pivot"
- ] = """
- Return reshaped DataFrame organized by given index / column values.
- Reshape data (produce a "pivot" table) based on column values. Uses
- unique values from specified `index` / `columns` to form axes of the
- resulting DataFrame. This function does not support data
- aggregation, multiple values will result in a MultiIndex in the
- columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
- Parameters
- ----------%s
- index : str or object, optional
- Column to use to make new frame's index. If None, uses
- existing index.
- columns : str or object
- Column to use to make new frame's columns.
- values : str, object or a list of the previous, optional
- Column(s) to use for populating new frame's values. If not
- specified, all remaining columns will be used and the result will
- have hierarchically indexed columns.
- .. versionchanged:: 0.23.0
- Also accept list of column names.
- Returns
- -------
- DataFrame
- Returns reshaped DataFrame.
- Raises
- ------
- ValueError:
- When there are any `index`, `columns` combinations with multiple
- values. Use `DataFrame.pivot_table` when you need to aggregate.
- See Also
- --------
- DataFrame.pivot_table : Generalization of pivot that can handle
- duplicate values for one index/column pair.
- DataFrame.unstack : Pivot based on the index values instead of a
- column.
- Notes
- -----
- For finer-tuned control, see hierarchical indexing documentation along
- with the related stack/unstack methods.
- Examples
- --------
- >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
- ... 'two'],
- ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
- ... 'baz': [1, 2, 3, 4, 5, 6],
- ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
- >>> df
- foo bar baz zoo
- 0 one A 1 x
- 1 one B 2 y
- 2 one C 3 z
- 3 two A 4 q
- 4 two B 5 w
- 5 two C 6 t
- >>> df.pivot(index='foo', columns='bar', values='baz')
- bar A B C
- foo
- one 1 2 3
- two 4 5 6
- >>> df.pivot(index='foo', columns='bar')['baz']
- bar A B C
- foo
- one 1 2 3
- two 4 5 6
- >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
- baz zoo
- bar A B C A B C
- foo
- one 1 2 3 x y z
- two 4 5 6 q w t
- A ValueError is raised if there are any duplicates.
- >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
- ... "bar": ['A', 'A', 'B', 'C'],
- ... "baz": [1, 2, 3, 4]})
- >>> df
- foo bar baz
- 0 one A 1
- 1 one A 2
- 2 two B 3
- 3 two C 4
- Notice that the first two rows are the same for our `index`
- and `columns` arguments.
- >>> df.pivot(index='foo', columns='bar', values='baz')
- Traceback (most recent call last):
- ...
- ValueError: Index contains duplicate entries, cannot reshape
- """
- @Substitution("")
- @Appender(_shared_docs["pivot"])
- def pivot(self, index=None, columns=None, values=None) -> "DataFrame":
- from pandas.core.reshape.pivot import pivot
- return pivot(self, index=index, columns=columns, values=values)
- _shared_docs[
- "pivot_table"
- ] = """
- Create a spreadsheet-style pivot table as a DataFrame.
- The levels in the pivot table will be stored in MultiIndex objects
- (hierarchical indexes) on the index and columns of the result DataFrame.
- Parameters
- ----------%s
- values : column to aggregate, optional
- index : column, Grouper, array, or list of the previous
- If an array is passed, it must be the same length as the data. The
- list can contain any of the other types (except list).
- Keys to group by on the pivot table index. If an array is passed,
- it is used in the same manner as column values.
- columns : column, Grouper, array, or list of the previous
- If an array is passed, it must be the same length as the data. The
- list can contain any of the other types (except list).
- Keys to group by on the pivot table column. If an array is passed,
- it is used in the same manner as column values.
- aggfunc : function, list of functions, dict, default numpy.mean
- If list of functions passed, the resulting pivot table will have
- hierarchical columns whose top level are the function names
- (inferred from the function objects themselves)
- If dict is passed, the key is column to aggregate and value
- is function or list of functions.
- fill_value : scalar, default None
- Value to replace missing values with.
- margins : bool, default False
- Add all rows / columns (e.g. for subtotal / grand totals).
- dropna : bool, default True
- Do not include columns whose entries are all NaN.
- margins_name : str, default 'All'
- Name of the row / column that will contain the totals
- when margins is True.
- observed : bool, default False
- This only applies if any of the groupers are Categoricals.
- If True: only show observed values for categorical groupers.
- If False: show all values for categorical groupers.
- .. versionchanged:: 0.25.0
- Returns
- -------
- DataFrame
- An Excel style pivot table.
- See Also
- --------
- DataFrame.pivot : Pivot without aggregation that can handle
- non-numeric data.
- Examples
- --------
- >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
- ... "bar", "bar", "bar", "bar"],
- ... "B": ["one", "one", "one", "two", "two",
- ... "one", "one", "two", "two"],
- ... "C": ["small", "large", "large", "small",
- ... "small", "large", "small", "small",
- ... "large"],
- ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
- ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
- >>> df
- A B C D E
- 0 foo one small 1 2
- 1 foo one large 2 4
- 2 foo one large 2 5
- 3 foo two small 3 5
- 4 foo two small 3 6
- 5 bar one large 4 6
- 6 bar one small 5 8
- 7 bar two small 6 9
- 8 bar two large 7 9
- This first example aggregates values by taking the sum.
- >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
- ... columns=['C'], aggfunc=np.sum)
- >>> table
- C large small
- A B
- bar one 4.0 5.0
- two 7.0 6.0
- foo one 4.0 1.0
- two NaN 6.0
- We can also fill missing values using the `fill_value` parameter.
- >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
- ... columns=['C'], aggfunc=np.sum, fill_value=0)
- >>> table
- C large small
- A B
- bar one 4 5
- two 7 6
- foo one 4 1
- two 0 6
- The next example aggregates by taking the mean across multiple columns.
- >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
- ... aggfunc={'D': np.mean,
- ... 'E': np.mean})
- >>> table
- D E
- A C
- bar large 5.500000 7.500000
- small 5.500000 8.500000
- foo large 2.000000 4.500000
- small 2.333333 4.333333
- We can also calculate multiple types of aggregations for any given
- value column.
- >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
- ... aggfunc={'D': np.mean,
- ... 'E': [min, max, np.mean]})
- >>> table
- D E
- mean max mean min
- A C
- bar large 5.500000 9.0 7.500000 6.0
- small 5.500000 9.0 8.500000 8.0
- foo large 2.000000 5.0 4.500000 4.0
- small 2.333333 6.0 4.333333 2.0
- """
- @Substitution("")
- @Appender(_shared_docs["pivot_table"])
- def pivot_table(
- self,
- values=None,
- index=None,
- columns=None,
- aggfunc="mean",
- fill_value=None,
- margins=False,
- dropna=True,
- margins_name="All",
- observed=False,
- ) -> "DataFrame":
- from pandas.core.reshape.pivot import pivot_table
- return pivot_table(
- self,
- values=values,
- index=index,
- columns=columns,
- aggfunc=aggfunc,
- fill_value=fill_value,
- margins=margins,
- dropna=dropna,
- margins_name=margins_name,
- observed=observed,
- )
- def stack(self, level=-1, dropna=True):
- """
- Stack the prescribed level(s) from columns to index.
- Return a reshaped DataFrame or Series having a multi-level
- index with one or more new inner-most levels compared to the current
- DataFrame. The new inner-most levels are created by pivoting the
- columns of the current dataframe:
- - if the columns have a single level, the output is a Series;
- - if the columns have multiple levels, the new index
- level(s) is (are) taken from the prescribed level(s) and
- the output is a DataFrame.
- The new index levels are sorted.
- Parameters
- ----------
- level : int, str, list, default -1
- Level(s) to stack from the column axis onto the index
- axis, defined as one index or label, or a list of indices
- or labels.
- dropna : bool, default True
- Whether to drop rows in the resulting Frame/Series with
- missing values. Stacking a column level onto the index
- axis can create combinations of index and column values
- that are missing from the original dataframe. See Examples
- section.
- Returns
- -------
- DataFrame or Series
- Stacked dataframe or series.
- See Also
- --------
- DataFrame.unstack : Unstack prescribed level(s) from index axis
- onto column axis.
- DataFrame.pivot : Reshape dataframe from long format to wide
- format.
- DataFrame.pivot_table : Create a spreadsheet-style pivot table
- as a DataFrame.
- Notes
- -----
- The function is named by analogy with a collection of books
- being reorganized from being side by side on a horizontal
- position (the columns of the dataframe) to being stacked
- vertically on top of each other (in the index of the
- dataframe).
- Examples
- --------
- **Single level columns**
- >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
- ... index=['cat', 'dog'],
- ... columns=['weight', 'height'])
- Stacking a dataframe with a single level column axis returns a Series:
- >>> df_single_level_cols
- weight height
- cat 0 1
- dog 2 3
- >>> df_single_level_cols.stack()
- cat weight 0
- height 1
- dog weight 2
- height 3
- dtype: int64
- **Multi level columns: simple case**
- >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
- ... ('weight', 'pounds')])
- >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
- ... index=['cat', 'dog'],
- ... columns=multicol1)
- Stacking a dataframe with a multi-level column axis:
- >>> df_multi_level_cols1
- weight
- kg pounds
- cat 1 2
- dog 2 4
- >>> df_multi_level_cols1.stack()
- weight
- cat kg 1
- pounds 2
- dog kg 2
- pounds 4
- **Missing values**
- >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
- ... ('height', 'm')])
- >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
- ... index=['cat', 'dog'],
- ... columns=multicol2)
- It is common to have missing values when stacking a dataframe
- with multi-level columns, as the stacked dataframe typically
- has more values than the original dataframe. Missing values
- are filled with NaNs:
- >>> df_multi_level_cols2
- weight height
- kg m
- cat 1.0 2.0
- dog 3.0 4.0
- >>> df_multi_level_cols2.stack()
- height weight
- cat kg NaN 1.0
- m 2.0 NaN
- dog kg NaN 3.0
- m 4.0 NaN
- **Prescribing the level(s) to be stacked**
- The first parameter controls which level or levels are stacked:
- >>> df_multi_level_cols2.stack(0)
- kg m
- cat height NaN 2.0
- weight 1.0 NaN
- dog height NaN 4.0
- weight 3.0 NaN
- >>> df_multi_level_cols2.stack([0, 1])
- cat height m 2.0
- weight kg 1.0
- dog height m 4.0
- weight kg 3.0
- dtype: float64
- **Dropping missing values**
- >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
- ... index=['cat', 'dog'],
- ... columns=multicol2)
- Note that rows where all values are missing are dropped by
- default but this behaviour can be controlled via the dropna
- keyword parameter:
- >>> df_multi_level_cols3
- weight height
- kg m
- cat NaN 1.0
- dog 2.0 3.0
- >>> df_multi_level_cols3.stack(dropna=False)
- height weight
- cat kg NaN NaN
- m 1.0 NaN
- dog kg NaN 2.0
- m 3.0 NaN
- >>> df_multi_level_cols3.stack(dropna=True)
- height weight
- cat m 1.0 NaN
- dog kg NaN 2.0
- m 3.0 NaN
- """
- from pandas.core.reshape.reshape import stack, stack_multiple
- if isinstance(level, (tuple, list)):
- return stack_multiple(self, level, dropna=dropna)
- else:
- return stack(self, level, dropna=dropna)
- def explode(self, column: Union[str, Tuple]) -> "DataFrame":
- """
- Transform each element of a list-like to a row, replicating index values.
- .. versionadded:: 0.25.0
- Parameters
- ----------
- column : str or tuple
- Column to explode.
- Returns
- -------
- DataFrame
- Exploded lists to rows of the subset columns;
- index will be duplicated for these rows.
- Raises
- ------
- ValueError :
- if columns of the frame are not unique.
- See Also
- --------
- DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
- index labels.
- DataFrame.melt : Unpivot a DataFrame from wide format to long format.
- Series.explode : Transform each element of a list-like to a row.
- Notes
- -----
- This routine will explode list-likes including lists, tuples,
- Series, and np.ndarray. The result dtype of the subset rows will
- be object. Scalars will be returned unchanged. Empty list-likes will
- result in a np.nan for that row.
- Examples
- --------
- >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
- >>> df
- A B
- 0 [1, 2, 3] 1
- 1 foo 1
- 2 [] 1
- 3 [3, 4] 1
- >>> df.explode('A')
- A B
- 0 1 1
- 0 2 1
- 0 3 1
- 1 foo 1
- 2 NaN 1
- 3 3 1
- 3 4 1
- """
- if not (is_scalar(column) or isinstance(column, tuple)):
- raise ValueError("column must be a scalar")
- if not self.columns.is_unique:
- raise ValueError("columns must be unique")
- df = self.reset_index(drop=True)
- # TODO: use overload to refine return type of reset_index
- assert df is not None # needed for mypy
- result = df[column].explode()
- result = df.drop([column], axis=1).join(result)
- result.index = self.index.take(result.index)
- result = result.reindex(columns=self.columns, copy=False)
- return result
- def unstack(self, level=-1, fill_value=None):
- """
- Pivot a level of the (necessarily hierarchical) index labels.
- Returns a DataFrame having a new level of column labels whose inner-most level
- consists of the pivoted index labels.
- If the index is not a MultiIndex, the output will be a Series
- (the analogue of stack when the columns are not a MultiIndex).
- The level involved will automatically get sorted.
- Parameters
- ----------
- level : int, str, or list of these, default -1 (last level)
- Level(s) of index to unstack, can pass level name.
- fill_value : int, str or dict
- Replace NaN with this value if the unstack produces missing values.
- Returns
- -------
- Series or DataFrame
- See Also
- --------
- DataFrame.pivot : Pivot a table based on column values.
- DataFrame.stack : Pivot a level of the column labels (inverse operation
- from `unstack`).
- Examples
- --------
- >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
- ... ('two', 'a'), ('two', 'b')])
- >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
- >>> s
- one a 1.0
- b 2.0
- two a 3.0
- b 4.0
- dtype: float64
- >>> s.unstack(level=-1)
- a b
- one 1.0 2.0
- two 3.0 4.0
- >>> s.unstack(level=0)
- one two
- a 1.0 3.0
- b 2.0 4.0
- >>> df = s.unstack(level=0)
- >>> df.unstack()
- one a 1.0
- b 2.0
- two a 3.0
- b 4.0
- dtype: float64
- """
- from pandas.core.reshape.reshape import unstack
- return unstack(self, level, fill_value)
- _shared_docs[
- "melt"
- ] = """
- Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
- This function is useful to massage a DataFrame into a format where one
- or more columns are identifier variables (`id_vars`), while all other
- columns, considered measured variables (`value_vars`), are "unpivoted" to
- the row axis, leaving just two non-identifier columns, 'variable' and
- 'value'.
- %(versionadded)s
- Parameters
- ----------
- id_vars : tuple, list, or ndarray, optional
- Column(s) to use as identifier variables.
- value_vars : tuple, list, or ndarray, optional
- Column(s) to unpivot. If not specified, uses all columns that
- are not set as `id_vars`.
- var_name : scalar
- Name to use for the 'variable' column. If None it uses
- ``frame.columns.name`` or 'variable'.
- value_name : scalar, default 'value'
- Name to use for the 'value' column.
- col_level : int or str, optional
- If columns are a MultiIndex then use this level to melt.
- Returns
- -------
- DataFrame
- Unpivoted DataFrame.
- See Also
- --------
- %(other)s
- pivot_table
- DataFrame.pivot
- Series.explode
- Examples
- --------
- >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
- ... 'B': {0: 1, 1: 3, 2: 5},
- ... 'C': {0: 2, 1: 4, 2: 6}})
- >>> df
- A B C
- 0 a 1 2
- 1 b 3 4
- 2 c 5 6
- >>> %(caller)sid_vars=['A'], value_vars=['B'])
- A variable value
- 0 a B 1
- 1 b B 3
- 2 c B 5
- >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
- A variable value
- 0 a B 1
- 1 b B 3
- 2 c B 5
- 3 a C 2
- 4 b C 4
- 5 c C 6
- The names of 'variable' and 'value' columns can be customized:
- >>> %(caller)sid_vars=['A'], value_vars=['B'],
- ... var_name='myVarname', value_name='myValname')
- A myVarname myValname
- 0 a B 1
- 1 b B 3
- 2 c B 5
- If you have multi-index columns:
- >>> df.columns = [list('ABC'), list('DEF')]
- >>> df
- A B C
- D E F
- 0 a 1 2
- 1 b 3 4
- 2 c 5 6
- >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
- A variable value
- 0 a B 1
- 1 b B 3
- 2 c B 5
- >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
- (A, D) variable_0 variable_1 value
- 0 a B E 1
- 1 b B E 3
- 2 c B E 5
- """
- @Appender(
- _shared_docs["melt"]
- % dict(
- caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt"
- )
- )
- def melt(
- self,
- id_vars=None,
- value_vars=None,
- var_name=None,
- value_name="value",
- col_level=None,
- ) -> "DataFrame":
- from pandas.core.reshape.melt import melt
- return melt(
- self,
- id_vars=id_vars,
- value_vars=value_vars,
- var_name=var_name,
- value_name=value_name,
- col_level=col_level,
- )
- # ----------------------------------------------------------------------
- # Time series-related
- def diff(self, periods=1, axis=0) -> "DataFrame":
- """
- First discrete difference of element.
- Calculates the difference of a DataFrame element compared with another
- element in the DataFrame (default is the element in the same column
- of the previous row).
- Parameters
- ----------
- periods : int, default 1
- Periods to shift for calculating difference, accepts negative
- values.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Take difference over rows (0) or columns (1).
- Returns
- -------
- DataFrame
- See Also
- --------
- Series.diff: First discrete difference for a Series.
- DataFrame.pct_change: Percent change over given number of periods.
- DataFrame.shift: Shift index by desired number of periods with an
- optional time freq.
- Notes
- -----
- For boolean dtypes, this uses :meth:`operator.xor` rather than
- :meth:`operator.sub`.
- Examples
- --------
- Difference with previous row
- >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
- ... 'b': [1, 1, 2, 3, 5, 8],
- ... 'c': [1, 4, 9, 16, 25, 36]})
- >>> df
- a b c
- 0 1 1 1
- 1 2 1 4
- 2 3 2 9
- 3 4 3 16
- 4 5 5 25
- 5 6 8 36
- >>> df.diff()
- a b c
- 0 NaN NaN NaN
- 1 1.0 0.0 3.0
- 2 1.0 1.0 5.0
- 3 1.0 1.0 7.0
- 4 1.0 2.0 9.0
- 5 1.0 3.0 11.0
- Difference with previous column
- >>> df.diff(axis=1)
- a b c
- 0 NaN 0.0 0.0
- 1 NaN -1.0 3.0
- 2 NaN -1.0 7.0
- 3 NaN -1.0 13.0
- 4 NaN 0.0 20.0
- 5 NaN 2.0 28.0
- Difference with 3rd previous row
- >>> df.diff(periods=3)
- a b c
- 0 NaN NaN NaN
- 1 NaN NaN NaN
- 2 NaN NaN NaN
- 3 3.0 2.0 15.0
- 4 3.0 4.0 21.0
- 5 3.0 6.0 27.0
- Difference with following row
- >>> df.diff(periods=-1)
- a b c
- 0 -1.0 0.0 -3.0
- 1 -1.0 -1.0 -5.0
- 2 -1.0 -1.0 -7.0
- 3 -1.0 -2.0 -9.0
- 4 -1.0 -3.0 -11.0
- 5 NaN NaN NaN
- """
- bm_axis = self._get_block_manager_axis(axis)
- new_data = self._data.diff(n=periods, axis=bm_axis)
- return self._constructor(new_data)
- # ----------------------------------------------------------------------
- # Function application
- def _gotitem(
- self,
- key: Union[str, List[str]],
- ndim: int,
- subset: Optional[Union[Series, ABCDataFrame]] = None,
- ) -> Union[Series, ABCDataFrame]:
- """
- Sub-classes to define. Return a sliced object.
- Parameters
- ----------
- key : string / list of selections
- ndim : 1,2
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- if subset is None:
- subset = self
- elif subset.ndim == 1: # is Series
- return subset
- # TODO: _shallow_copy(subset)?
- return subset[key]
- _agg_summary_and_see_also_doc = dedent(
- """
- The aggregation operations are always performed over an axis, either the
- index (default) or the column axis. This behavior is different from
- `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
- `var`), where the default is to compute the aggregation of the flattened
- array, e.g., ``numpy.mean(arr_2d)`` as opposed to
- ``numpy.mean(arr_2d, axis=0)``.
- `agg` is an alias for `aggregate`. Use the alias.
- See Also
- --------
- DataFrame.apply : Perform any type of operations.
- DataFrame.transform : Perform transformation type operations.
- core.groupby.GroupBy : Perform operations over groups.
- core.resample.Resampler : Perform operations over resampled bins.
- core.window.Rolling : Perform operations over rolling window.
- core.window.Expanding : Perform operations over expanding window.
- core.window.EWM : Perform operation over exponential weighted
- window.
- """
- )
- _agg_examples_doc = dedent(
- """
- Examples
- --------
- >>> df = pd.DataFrame([[1, 2, 3],
- ... [4, 5, 6],
- ... [7, 8, 9],
- ... [np.nan, np.nan, np.nan]],
- ... columns=['A', 'B', 'C'])
- Aggregate these functions over the rows.
- >>> df.agg(['sum', 'min'])
- A B C
- sum 12.0 15.0 18.0
- min 1.0 2.0 3.0
- Different aggregations per column.
- >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
- A B
- max NaN 8.0
- min 1.0 2.0
- sum 12.0 NaN
- Aggregate over the columns.
- >>> df.agg("mean", axis="columns")
- 0 2.0
- 1 5.0
- 2 8.0
- 3 NaN
- dtype: float64
- """
- )
- @Substitution(
- see_also=_agg_summary_and_see_also_doc,
- examples=_agg_examples_doc,
- versionadded="\n.. versionadded:: 0.20.0\n",
- **_shared_doc_kwargs,
- )
- @Appender(_shared_docs["aggregate"])
- def aggregate(self, func, axis=0, *args, **kwargs):
- axis = self._get_axis_number(axis)
- result = None
- try:
- result, how = self._aggregate(func, axis=axis, *args, **kwargs)
- except TypeError:
- pass
- if result is None:
- return self.apply(func, axis=axis, args=args, **kwargs)
- return result
- def _aggregate(self, arg, axis=0, *args, **kwargs):
- if axis == 1:
- # NDFrame.aggregate returns a tuple, and we need to transpose
- # only result
- result, how = self.T._aggregate(arg, *args, **kwargs)
- result = result.T if result is not None else result
- return result, how
- return super()._aggregate(arg, *args, **kwargs)
- agg = aggregate
- @Appender(_shared_docs["transform"] % _shared_doc_kwargs)
- def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame":
- axis = self._get_axis_number(axis)
- if axis == 1:
- return self.T.transform(func, *args, **kwargs).T
- return super().transform(func, *args, **kwargs)
- def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
- """
- Apply a function along an axis of the DataFrame.
- Objects passed to the function are Series objects whose index is
- either the DataFrame's index (``axis=0``) or the DataFrame's columns
- (``axis=1``). By default (``result_type=None``), the final return type
- is inferred from the return type of the applied function. Otherwise,
- it depends on the `result_type` argument.
- Parameters
- ----------
- func : function
- Function to apply to each column or row.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Axis along which the function is applied:
- * 0 or 'index': apply function to each column.
- * 1 or 'columns': apply function to each row.
- raw : bool, default False
- Determines if row or column is passed as a Series or ndarray object:
- * ``False`` : passes each row or column as a Series to the
- function.
- * ``True`` : the passed function will receive ndarray objects
- instead.
- If you are just applying a NumPy reduction function this will
- achieve much better performance.
- result_type : {'expand', 'reduce', 'broadcast', None}, default None
- These only act when ``axis=1`` (columns):
- * 'expand' : list-like results will be turned into columns.
- * 'reduce' : returns a Series if possible rather than expanding
- list-like results. This is the opposite of 'expand'.
- * 'broadcast' : results will be broadcast to the original shape
- of the DataFrame, the original index and columns will be
- retained.
- The default behaviour (None) depends on the return value of the
- applied function: list-like results will be returned as a Series
- of those. However if the apply function returns a Series these
- are expanded to columns.
- .. versionadded:: 0.23.0
- args : tuple
- Positional arguments to pass to `func` in addition to the
- array/series.
- **kwds
- Additional keyword arguments to pass as keywords arguments to
- `func`.
- Returns
- -------
- Series or DataFrame
- Result of applying ``func`` along the given axis of the
- DataFrame.
- See Also
- --------
- DataFrame.applymap: For elementwise operations.
- DataFrame.aggregate: Only perform aggregating type operations.
- DataFrame.transform: Only perform transforming type operations.
- Examples
- --------
- >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
- >>> df
- A B
- 0 4 9
- 1 4 9
- 2 4 9
- Using a numpy universal function (in this case the same as
- ``np.sqrt(df)``):
- >>> df.apply(np.sqrt)
- A B
- 0 2.0 3.0
- 1 2.0 3.0
- 2 2.0 3.0
- Using a reducing function on either axis
- >>> df.apply(np.sum, axis=0)
- A 12
- B 27
- dtype: int64
- >>> df.apply(np.sum, axis=1)
- 0 13
- 1 13
- 2 13
- dtype: int64
- Returning a list-like will result in a Series
- >>> df.apply(lambda x: [1, 2], axis=1)
- 0 [1, 2]
- 1 [1, 2]
- 2 [1, 2]
- dtype: object
- Passing result_type='expand' will expand list-like results
- to columns of a Dataframe
- >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
- 0 1
- 0 1 2
- 1 1 2
- 2 1 2
- Returning a Series inside the function is similar to passing
- ``result_type='expand'``. The resulting column names
- will be the Series index.
- >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
- foo bar
- 0 1 2
- 1 1 2
- 2 1 2
- Passing ``result_type='broadcast'`` will ensure the same shape
- result, whether list-like or scalar is returned by the function,
- and broadcast it along the axis. The resulting column names will
- be the originals.
- >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
- A B
- 0 1 2
- 1 1 2
- 2 1 2
- """
- from pandas.core.apply import frame_apply
- op = frame_apply(
- self,
- func=func,
- axis=axis,
- raw=raw,
- result_type=result_type,
- args=args,
- kwds=kwds,
- )
- return op.get_result()
- def applymap(self, func) -> "DataFrame":
- """
- Apply a function to a Dataframe elementwise.
- This method applies a function that accepts and returns a scalar
- to every element of a DataFrame.
- Parameters
- ----------
- func : callable
- Python function, returns a single value from a single value.
- Returns
- -------
- DataFrame
- Transformed DataFrame.
- See Also
- --------
- DataFrame.apply : Apply a function along input axis of DataFrame.
- Notes
- -----
- In the current implementation applymap calls `func` twice on the
- first column/row to decide whether it can take a fast or slow
- code path. This can lead to unexpected behavior if `func` has
- side-effects, as they will take effect twice for the first
- column/row.
- Examples
- --------
- >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
- >>> df
- 0 1
- 0 1.000 2.120
- 1 3.356 4.567
- >>> df.applymap(lambda x: len(str(x)))
- 0 1
- 0 3 4
- 1 5 5
- Note that a vectorized version of `func` often exists, which will
- be much faster. You could square each number elementwise.
- >>> df.applymap(lambda x: x**2)
- 0 1
- 0 1.000000 4.494400
- 1 11.262736 20.857489
- But it's better to avoid applymap in that case.
- >>> df ** 2
- 0 1
- 0 1.000000 4.494400
- 1 11.262736 20.857489
- """
- # if we have a dtype == 'M8[ns]', provide boxed values
- def infer(x):
- if x.empty:
- return lib.map_infer(x, func)
- return lib.map_infer(x.astype(object).values, func)
- return self.apply(infer)
- # ----------------------------------------------------------------------
- # Merging / joining methods
- def append(
- self, other, ignore_index=False, verify_integrity=False, sort=False
- ) -> "DataFrame":
- """
- Append rows of `other` to the end of caller, returning a new object.
- Columns in `other` that are not in the caller are added as new columns.
- Parameters
- ----------
- other : DataFrame or Series/dict-like object, or list of these
- The data to append.
- ignore_index : bool, default False
- If True, do not use the index labels.
- verify_integrity : bool, default False
- If True, raise ValueError on creating index with duplicates.
- sort : bool, default False
- Sort columns if the columns of `self` and `other` are not aligned.
- .. versionadded:: 0.23.0
- .. versionchanged:: 1.0.0
- Changed to not sort by default.
- Returns
- -------
- DataFrame
- See Also
- --------
- concat : General function to concatenate DataFrame or Series objects.
- Notes
- -----
- If a list of dict/series is passed and the keys are all contained in
- the DataFrame's index, the order of the columns in the resulting
- DataFrame will be unchanged.
- Iteratively appending rows to a DataFrame can be more computationally
- intensive than a single concatenate. A better solution is to append
- those rows to a list and then concatenate the list with the original
- DataFrame all at once.
- Examples
- --------
- >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
- >>> df
- A B
- 0 1 2
- 1 3 4
- >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
- >>> df.append(df2)
- A B
- 0 1 2
- 1 3 4
- 0 5 6
- 1 7 8
- With `ignore_index` set to True:
- >>> df.append(df2, ignore_index=True)
- A B
- 0 1 2
- 1 3 4
- 2 5 6
- 3 7 8
- The following, while not recommended methods for generating DataFrames,
- show two ways to generate a DataFrame from multiple data sources.
- Less efficient:
- >>> df = pd.DataFrame(columns=['A'])
- >>> for i in range(5):
- ... df = df.append({'A': i}, ignore_index=True)
- >>> df
- A
- 0 0
- 1 1
- 2 2
- 3 3
- 4 4
- More efficient:
- >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
- ... ignore_index=True)
- A
- 0 0
- 1 1
- 2 2
- 3 3
- 4 4
- """
- if isinstance(other, (Series, dict)):
- if isinstance(other, dict):
- other = Series(other)
- if other.name is None and not ignore_index:
- raise TypeError(
- "Can only append a Series if ignore_index=True "
- "or if the Series has a name"
- )
- index = Index([other.name], name=self.index.name)
- idx_diff = other.index.difference(self.columns)
- try:
- combined_columns = self.columns.append(idx_diff)
- except TypeError:
- combined_columns = self.columns.astype(object).append(idx_diff)
- other = (
- other.reindex(combined_columns, copy=False)
- .to_frame()
- .T.infer_objects()
- .rename_axis(index.names, copy=False)
- )
- if not self.columns.equals(combined_columns):
- self = self.reindex(columns=combined_columns)
- elif isinstance(other, list):
- if not other:
- pass
- elif not isinstance(other[0], DataFrame):
- other = DataFrame(other)
- if (self.columns.get_indexer(other.columns) >= 0).all():
- other = other.reindex(columns=self.columns)
- from pandas.core.reshape.concat import concat
- if isinstance(other, (list, tuple)):
- to_concat = [self, *other]
- else:
- to_concat = [self, other]
- return concat(
- to_concat,
- ignore_index=ignore_index,
- verify_integrity=verify_integrity,
- sort=sort,
- )
- def join(
- self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
- ) -> "DataFrame":
- """
- Join columns of another DataFrame.
- Join columns with `other` DataFrame either on index or on a key
- column. Efficiently join multiple DataFrame objects by index at once by
- passing a list.
- Parameters
- ----------
- other : DataFrame, Series, or list of DataFrame
- Index should be similar to one of the columns in this one. If a
- Series is passed, its name attribute must be set, and that will be
- used as the column name in the resulting joined DataFrame.
- on : str, list of str, or array-like, optional
- Column or index level name(s) in the caller to join on the index
- in `other`, otherwise joins index-on-index. If multiple
- values given, the `other` DataFrame must have a MultiIndex. Can
- pass an array as the join key if it is not already contained in
- the calling DataFrame. Like an Excel VLOOKUP operation.
- how : {'left', 'right', 'outer', 'inner'}, default 'left'
- How to handle the operation of the two objects.
- * left: use calling frame's index (or column if on is specified)
- * right: use `other`'s index.
- * outer: form union of calling frame's index (or column if on is
- specified) with `other`'s index, and sort it lexicographically.
- * inner: form intersection of calling frame's index (or column if
- on is specified) with `other`'s index, preserving the order
- of the calling's one.
- lsuffix : str, default ''
- Suffix to use from left frame's overlapping columns.
- rsuffix : str, default ''
- Suffix to use from right frame's overlapping columns.
- sort : bool, default False
- Order result DataFrame lexicographically by the join key. If False,
- the order of the join key depends on the join type (how keyword).
- Returns
- -------
- DataFrame
- A dataframe containing columns from both the caller and `other`.
- See Also
- --------
- DataFrame.merge : For column(s)-on-columns(s) operations.
- Notes
- -----
- Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
- passing a list of `DataFrame` objects.
- Support for specifying index levels as the `on` parameter was added
- in version 0.23.0.
- Examples
- --------
- >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
- ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
- >>> df
- key A
- 0 K0 A0
- 1 K1 A1
- 2 K2 A2
- 3 K3 A3
- 4 K4 A4
- 5 K5 A5
- >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
- ... 'B': ['B0', 'B1', 'B2']})
- >>> other
- key B
- 0 K0 B0
- 1 K1 B1
- 2 K2 B2
- Join DataFrames using their indexes.
- >>> df.join(other, lsuffix='_caller', rsuffix='_other')
- key_caller A key_other B
- 0 K0 A0 K0 B0
- 1 K1 A1 K1 B1
- 2 K2 A2 K2 B2
- 3 K3 A3 NaN NaN
- 4 K4 A4 NaN NaN
- 5 K5 A5 NaN NaN
- If we want to join using the key columns, we need to set key to be
- the index in both `df` and `other`. The joined DataFrame will have
- key as its index.
- >>> df.set_index('key').join(other.set_index('key'))
- A B
- key
- K0 A0 B0
- K1 A1 B1
- K2 A2 B2
- K3 A3 NaN
- K4 A4 NaN
- K5 A5 NaN
- Another option to join using the key columns is to use the `on`
- parameter. DataFrame.join always uses `other`'s index but we can use
- any column in `df`. This method preserves the original DataFrame's
- index in the result.
- >>> df.join(other.set_index('key'), on='key')
- key A B
- 0 K0 A0 B0
- 1 K1 A1 B1
- 2 K2 A2 B2
- 3 K3 A3 NaN
- 4 K4 A4 NaN
- 5 K5 A5 NaN
- """
- return self._join_compat(
- other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort
- )
- def _join_compat(
- self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
- ):
- from pandas.core.reshape.merge import merge
- from pandas.core.reshape.concat import concat
- if isinstance(other, Series):
- if other.name is None:
- raise ValueError("Other Series must have a name")
- other = DataFrame({other.name: other})
- if isinstance(other, DataFrame):
- return merge(
- self,
- other,
- left_on=on,
- how=how,
- left_index=on is None,
- right_index=True,
- suffixes=(lsuffix, rsuffix),
- sort=sort,
- )
- else:
- if on is not None:
- raise ValueError(
- "Joining multiple DataFrames only supported for joining on index"
- )
- frames = [self] + list(other)
- can_concat = all(df.index.is_unique for df in frames)
- # join indexes only using concat
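- # Indexes must be unique to join via concat; otherwise fall back
- # to the pairwise merge loop below.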
- if can_concat:
- if how == "left":
- res = concat(
- frames, axis=1, join="outer", verify_integrity=True, sort=sort
- )
- return res.reindex(self.index, copy=False)
- else:
- return concat(
- frames, axis=1, join=how, verify_integrity=True, sort=sort
- )
- joined = frames[0]
- for frame in frames[1:]:
- joined = merge(
- joined, frame, how=how, left_index=True, right_index=True
- )
- return joined
- @Substitution("")
- @Appender(_merge_doc, indents=2)
- def merge(
- self,
- right,
- how="inner",
- on=None,
- left_on=None,
- right_on=None,
- left_index=False,
- right_index=False,
- sort=False,
- suffixes=("_x", "_y"),
- copy=True,
- indicator=False,
- validate=None,
- ) -> "DataFrame":
- from pandas.core.reshape.merge import merge
- return merge(
- self,
- right,
- how=how,
- on=on,
- left_on=left_on,
- right_on=right_on,
- left_index=left_index,
- right_index=right_index,
- sort=sort,
- suffixes=suffixes,
- copy=copy,
- indicator=indicator,
- validate=validate,
- )
- def round(self, decimals=0, *args, **kwargs) -> "DataFrame":
- """
- Round a DataFrame to a variable number of decimal places.
- Parameters
- ----------
- decimals : int, dict, Series
- Number of decimal places to round each column to. If an int is
- given, round each column to the same number of places.
- Otherwise dict and Series round to variable numbers of places.
- Column names should be in the keys if `decimals` is a
- dict-like, or in the index if `decimals` is a Series. Any
- columns not included in `decimals` will be left as is. Elements
- of `decimals` which are not columns of the input will be
- ignored.
- *args
- Additional positional arguments have no effect but might be
- accepted for compatibility with numpy.
- **kwargs
- Additional keywords have no effect but might be accepted for
- compatibility with numpy.
- Returns
- -------
- DataFrame
- A DataFrame with the affected columns rounded to the specified
- number of decimal places.
- See Also
- --------
- numpy.around : Round a numpy array to the given number of decimals.
- Series.round : Round a Series to the given number of decimals.
- Examples
- --------
- >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
- ... columns=['dogs', 'cats'])
- >>> df
- dogs cats
- 0 0.21 0.32
- 1 0.01 0.67
- 2 0.66 0.03
- 3 0.21 0.18
- By providing an integer, each column is rounded to the same number
- of decimal places
- >>> df.round(1)
- dogs cats
- 0 0.2 0.3
- 1 0.0 0.7
- 2 0.7 0.0
- 3 0.2 0.2
- With a dict, the number of places for specific columns can be
- specified with the column names as key and the number of decimal
- places as value
- >>> df.round({'dogs': 1, 'cats': 0})
- dogs cats
- 0 0.2 0.0
- 1 0.0 1.0
- 2 0.7 0.0
- 3 0.2 0.0
- Using a Series, the number of places for specific columns can be
- specified with the column names as index and the number of
- decimal places as value
- >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
- >>> df.round(decimals)
- dogs cats
- 0 0.2 0.0
- 1 0.0 1.0
- 2 0.7 0.0
- 3 0.2 0.0
- """
- from pandas.core.reshape.concat import concat
- def _dict_round(df, decimals):
- for col, vals in df.items():
- try:
- yield _series_round(vals, decimals[col])
- except KeyError:
- yield vals
- def _series_round(s, decimals):
- if is_integer_dtype(s) or is_float_dtype(s):
- return s.round(decimals)
- return s
- nv.validate_round(args, kwargs)
- if isinstance(decimals, (dict, Series)):
- if isinstance(decimals, Series):
- if not decimals.index.is_unique:
- raise ValueError("Index of decimals must be unique")
- new_cols = list(_dict_round(self, decimals))
- elif is_integer(decimals):
- # Dispatch to Series.round
- new_cols = [_series_round(v, decimals) for _, v in self.items()]
- else:
- raise TypeError("decimals must be an integer, a dict-like or a Series")
- if len(new_cols) > 0:
- return self._constructor(
- concat(new_cols, axis=1), index=self.index, columns=self.columns
- )
- else:
- return self
- # ----------------------------------------------------------------------
- # Statistical methods, etc.
- def corr(self, method="pearson", min_periods=1) -> "DataFrame":
- """
- Compute pairwise correlation of columns, excluding NA/null values.
- Parameters
- ----------
- method : {'pearson', 'kendall', 'spearman'} or callable
- Method of correlation:
- * pearson : standard correlation coefficient
- * kendall : Kendall Tau correlation coefficient
- * spearman : Spearman rank correlation
- * callable: callable with input two 1d ndarrays
- and returning a float. Note that the returned matrix from corr
- will have 1 along the diagonals and will be symmetric
- regardless of the callable's behavior.
- .. versionadded:: 0.24.0
- min_periods : int, optional
- Minimum number of observations required per pair of columns
- to have a valid result. Currently only available for Pearson
- and Spearman correlation.
- Returns
- -------
- DataFrame
- Correlation matrix.
- See Also
- --------
- DataFrame.corrwith
- Series.corr
- Examples
- --------
- >>> def histogram_intersection(a, b):
- ... v = np.minimum(a, b).sum().round(decimals=1)
- ... return v
- >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
- ... columns=['dogs', 'cats'])
- >>> df.corr(method=histogram_intersection)
- dogs cats
- dogs 1.0 0.3
- cats 0.3 1.0
- """
- numeric_df = self._get_numeric_data()
- cols = numeric_df.columns
- idx = cols.copy()
- mat = numeric_df.values
- if method == "pearson":
- correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
- elif method == "spearman":
- correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods)
- elif method == "kendall" or callable(method):
- if min_periods is None:
- min_periods = 1
- mat = ensure_float64(mat).T
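- # Compute each column pair once (the upper triangle) and mirror
- # the value, since the correlation matrix is symmetric.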
- corrf = nanops.get_corr_func(method)
- K = len(cols)
- correl = np.empty((K, K), dtype=float)
- mask = np.isfinite(mat)
- for i, ac in enumerate(mat):
- for j, bc in enumerate(mat):
- if i > j:
- continue
- valid = mask[i] & mask[j]
- if valid.sum() < min_periods:
- c = np.nan
- elif i == j:
- c = 1.0
- elif not valid.all():
- c = corrf(ac[valid], bc[valid])
- else:
- c = corrf(ac, bc)
- correl[i, j] = c
- correl[j, i] = c
- else:
- raise ValueError(
- "method must be either 'pearson', "
- "'spearman', 'kendall', or a callable, "
- f"'{method}' was supplied"
- )
- return self._constructor(correl, index=idx, columns=cols)
- def cov(self, min_periods=None) -> "DataFrame":
- """
- Compute pairwise covariance of columns, excluding NA/null values.
- Compute the pairwise covariance among the series of a DataFrame.
- The returned data frame is the `covariance matrix
- <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
- of the DataFrame.
- Both NA and null values are automatically excluded from the
- calculation. (See the note below about bias from missing values.)
- A threshold can be set for the minimum number of
- observations for each value created. Comparisons with observations
- below this threshold will be returned as ``NaN``.
- This method is generally used for the analysis of time series data to
- understand the relationship between different measures
- across time.
- Parameters
- ----------
- min_periods : int, optional
- Minimum number of observations required per pair of columns
- to have a valid result.
- Returns
- -------
- DataFrame
- The covariance matrix of the series of the DataFrame.
- See Also
- --------
- Series.cov : Compute covariance with another Series.
- core.window.EWM.cov: Exponential weighted sample covariance.
- core.window.Expanding.cov : Expanding sample covariance.
- core.window.Rolling.cov : Rolling sample covariance.
- Notes
- -----
- Returns the covariance matrix of the DataFrame's time series.
- The covariance is normalized by N-1.
- For DataFrames that have Series that are missing data (assuming that
- data is `missing at random
- <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
- the returned covariance matrix will be an unbiased estimate
- of the variance and covariance between the member Series.
- However, for many applications this estimate may not be acceptable
- because the estimated covariance matrix is not guaranteed to be
- positive semi-definite. This could lead to estimated correlations
- having absolute values greater than one, and/or a non-invertible
- covariance matrix. See `Estimation of covariance matrices
- <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_matrices>`__
- for more details.
- Examples
- --------
- >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
- ... columns=['dogs', 'cats'])
- >>> df.cov()
- dogs cats
- dogs 0.666667 -1.000000
- cats -1.000000 1.666667
- >>> np.random.seed(42)
- >>> df = pd.DataFrame(np.random.randn(1000, 5),
- ... columns=['a', 'b', 'c', 'd', 'e'])
- >>> df.cov()
- a b c d e
- a 0.998438 -0.020161 0.059277 -0.008943 0.014144
- b -0.020161 1.059352 -0.008543 -0.024738 0.009826
- c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
- d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
- e 0.014144 0.009826 -0.000271 -0.013692 0.977795
- **Minimum number of periods**
- This method also supports an optional ``min_periods`` keyword
- that specifies the required minimum number of non-NA observations for
- each column pair in order to have a valid result:
- >>> np.random.seed(42)
- >>> df = pd.DataFrame(np.random.randn(20, 3),
- ... columns=['a', 'b', 'c'])
- >>> df.loc[df.index[:5], 'a'] = np.nan
- >>> df.loc[df.index[5:10], 'b'] = np.nan
- >>> df.cov(min_periods=12)
- a b c
- a 0.316741 NaN -0.150812
- b NaN 1.248003 0.191417
- c -0.150812 0.191417 0.895202
- """
- numeric_df = self._get_numeric_data()
- cols = numeric_df.columns
- idx = cols.copy()
- mat = numeric_df.values
- if notna(mat).all():
- if min_periods is not None and min_periods > len(mat):
- baseCov = np.empty((mat.shape[1], mat.shape[1]))
- baseCov.fill(np.nan)
- else:
- baseCov = np.cov(mat.T)
- baseCov = baseCov.reshape((len(cols), len(cols)))
- else:
- baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods)
- return self._constructor(baseCov, index=idx, columns=cols)
- def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series:
- """
- Compute pairwise correlation.
- Pairwise correlation is computed between rows or columns of this
- DataFrame and the rows or columns of a Series or DataFrame.
- DataFrames are first aligned along both axes before the
- correlations are computed.
- Parameters
- ----------
- other : DataFrame, Series
- Object with which to compute correlations.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for
- row-wise.
- drop : bool, default False
- Drop missing indices from result.
- method : {'pearson', 'kendall', 'spearman'} or callable
- Method of correlation:
- * pearson : standard correlation coefficient
- * kendall : Kendall Tau correlation coefficient
- * spearman : Spearman rank correlation
- * callable: callable with input two 1d ndarrays
- and returning a float.
- .. versionadded:: 0.24.0
- Returns
- -------
- Series
- Pairwise correlations.
- See Also
- --------
- DataFrame.corr
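- Examples
- --------
- A minimal illustration; the inputs below are invented for this
- docstring:
- >>> df = pd.DataFrame({'a': [1, 2, 3, 4],
- ... 'b': [4, 3, 2, 1]})
- >>> s = pd.Series([1, 2, 3, 4])
- >>> df.corrwith(s)
- a 1.0
- b -1.0
- dtype: float64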
- """
- axis = self._get_axis_number(axis)
- this = self._get_numeric_data()
- if isinstance(other, Series):
- return this.apply(lambda x: other.corr(x, method=method), axis=axis)
- other = other._get_numeric_data()
- left, right = this.align(other, join="inner", copy=False)
- if axis == 1:
- left = left.T
- right = right.T
- if method == "pearson":
- # mask missing values
- left = left + right * 0
- right = right + left * 0
- # demeaned data
- ldem = left - left.mean()
- rdem = right - right.mean()
- num = (ldem * rdem).sum()
- dom = (left.count() - 1) * left.std() * right.std()
- correl = num / dom
- elif method in ["kendall", "spearman"] or callable(method):
- def c(x):
- return nanops.nancorr(x[0], x[1], method=method)
- correl = Series(
- map(c, zip(left.values.T, right.values.T)), index=left.columns
- )
- else:
- raise ValueError(
- f"Invalid method {method} was passed, "
- "valid methods are: 'pearson', 'kendall', "
- "'spearman', or callable"
- )
- if not drop:
- # Find non-matching labels along the given axis
- # and append missing correlations (GH 22375)
- raxis = 1 if axis == 0 else 0
- result_index = this._get_axis(raxis).union(other._get_axis(raxis))
- idx_diff = result_index.difference(correl.index)
- if len(idx_diff) > 0:
- correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff))
- return correl
- # ----------------------------------------------------------------------
- # ndarray-like stats methods
- def count(self, axis=0, level=None, numeric_only=False):
- """
- Count non-NA cells for each column or row.
- The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
- on `pandas.options.mode.use_inf_as_na`) are considered NA.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- If 0 or 'index' counts are generated for each column.
- If 1 or 'columns' counts are generated for each **row**.
- level : int or str, optional
- If the axis is a `MultiIndex` (hierarchical), count along a
- particular `level`, collapsing into a `DataFrame`.
- A `str` specifies the level name.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- Returns
- -------
- Series or DataFrame
- For each column/row the number of non-NA/null entries.
- If `level` is specified returns a `DataFrame`.
- See Also
- --------
- Series.count: Number of non-NA elements in a Series.
- DataFrame.shape: Number of DataFrame rows and columns (including NA
- elements).
- DataFrame.isna: Boolean same-sized DataFrame showing places of NA
- elements.
- Examples
- --------
- Constructing DataFrame from a dictionary:
- >>> df = pd.DataFrame({"Person":
- ... ["John", "Myla", "Lewis", "John", "Myla"],
- ... "Age": [24., np.nan, 21., 33, 26],
- ... "Single": [False, True, True, True, False]})
- >>> df
- Person Age Single
- 0 John 24.0 False
- 1 Myla NaN True
- 2 Lewis 21.0 True
- 3 John 33.0 True
- 4 Myla 26.0 False
- Notice the uncounted NA values:
- >>> df.count()
- Person 5
- Age 4
- Single 5
- dtype: int64
- Counts for each **row**:
- >>> df.count(axis='columns')
- 0 3
- 1 2
- 2 3
- 3 3
- 4 3
- dtype: int64
- Counts for one level of a `MultiIndex`:
- >>> df.set_index(["Person", "Single"]).count(level="Person")
- Age
- Person
- John 2
- Lewis 1
- Myla 1
- """
- axis = self._get_axis_number(axis)
- if level is not None:
- return self._count_level(level, axis=axis, numeric_only=numeric_only)
- if numeric_only:
- frame = self._get_numeric_data()
- else:
- frame = self
- # GH #423
- if len(frame._get_axis(axis)) == 0:
- result = Series(0, index=frame._get_agg_axis(axis))
- else:
- if frame._is_mixed_type or frame._data.any_extension_types:
- # the or any_extension_types is really only hit for single-
- # column frames with an extension array
- result = notna(frame).sum(axis=axis)
- else:
- # GH13407
- series_counts = notna(frame).sum(axis=axis)
- counts = series_counts.values
- result = Series(counts, index=frame._get_agg_axis(axis))
- return result.astype("int64")
- def _count_level(self, level, axis=0, numeric_only=False):
- if numeric_only:
- frame = self._get_numeric_data()
- else:
- frame = self
- count_axis = frame._get_axis(axis)
- agg_axis = frame._get_agg_axis(axis)
- if not isinstance(count_axis, ABCMultiIndex):
- raise TypeError(
- f"Can only count levels on hierarchical {self._get_axis_name(axis)}."
- )
- if frame._is_mixed_type:
- # Since we have mixed types, calling notna(frame.values) might
- # upcast everything to object
- mask = notna(frame).values
- else:
- # But use the speedup when we have homogeneous dtypes
- mask = notna(frame.values)
- if axis == 1:
- # We're transposing the mask rather than frame to avoid potential
- # upcasts to object, which induces a ~20x slowdown
- mask = mask.T
- if isinstance(level, str):
- level = count_axis._get_level_number(level)
- level_name = count_axis._names[level]
- level_index = count_axis.levels[level]._shallow_copy(name=level_name)
- level_codes = ensure_int64(count_axis.codes[level])
- counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0)
- result = DataFrame(counts, index=level_index, columns=agg_axis)
- if axis == 1:
- # Undo our earlier transpose
- return result.T
- else:
- return result
- def _reduce(
- self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
- ):
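- # Common engine for DataFrame reductions: apply `op` along `axis`,
- # honoring `skipna` and `numeric_only`, with fallbacks for mixed
- # or extension dtypes.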
- if axis is None and filter_type == "bool":
- labels = None
- constructor = None
- else:
- # TODO: Make other agg func handle axis=None properly
- axis = self._get_axis_number(axis)
- labels = self._get_agg_axis(axis)
- constructor = self._constructor
- def f(x):
- return op(x, axis=axis, skipna=skipna, **kwds)
- def _get_data(axis_matters):
- if filter_type is None or filter_type == "numeric":
- data = self._get_numeric_data()
- elif filter_type == "bool":
- if axis_matters:
- # GH#25101, GH#24434
- data = self._get_bool_data() if axis == 0 else self
- else:
- data = self._get_bool_data()
- else: # pragma: no cover
- msg = (
- f"Generating numeric_only data with filter_type {filter_type} "
- "not supported."
- )
- raise NotImplementedError(msg)
- return data
- if numeric_only is not None and axis in [0, 1]:
- df = self
- if numeric_only is True:
- df = _get_data(axis_matters=True)
- if axis == 1:
- df = df.T
- axis = 0
- out_dtype = "bool" if filter_type == "bool" else None
- # After possibly _get_data and transposing, we are now in the
- # simple case where we can use BlockManager._reduce
- res = df._data.reduce(op, axis=1, skipna=skipna, **kwds)
- assert isinstance(res, dict)
- if len(res):
- assert len(res) == max(list(res.keys())) + 1, res.keys()
- out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype)
- out.index = df.columns
- return out
- if numeric_only is None:
- values = self.values
- try:
- result = f(values)
- if filter_type == "bool" and is_object_dtype(values) and axis is None:
- # work around https://github.com/numpy/numpy/issues/10489
- # TODO: combine with hasattr(result, 'dtype') further down
- # hard since we don't have `values` down there.
- result = np.bool_(result)
- except TypeError:
- # e.g. in nanops trying to convert strs to float
- # try by-column first
- if filter_type is None and axis == 0:
- # This column-by-column fallback can end up with a non-reduction
- # result (but not always): if the dtypes are mixed with datelike
- # values, we need to make sure the result is a Series. We only
- # get here when numeric_only was not specified and the full-frame
- # reduction raised, so do what we can column by column.
- from pandas.core.apply import frame_apply
- opa = frame_apply(
- self, func=f, result_type="expand", ignore_failures=True
- )
- result = opa.get_result()
- if result.ndim == self.ndim:
- result = result.iloc[0]
- return result
- # TODO: why doesn't axis matter here?
- data = _get_data(axis_matters=False)
- with np.errstate(all="ignore"):
- result = f(data.values)
- labels = data._get_agg_axis(axis)
- else:
- if numeric_only:
- data = _get_data(axis_matters=True)
- values = data.values
- labels = data._get_agg_axis(axis)
- else:
- values = self.values
- result = f(values)
- if hasattr(result, "dtype") and is_object_dtype(result.dtype):
- try:
- if filter_type is None or filter_type == "numeric":
- result = result.astype(np.float64)
- elif filter_type == "bool" and notna(result).all():
- result = result.astype(np.bool_)
- except (ValueError, TypeError):
- # try to coerce to the original dtypes item by item if we can
- if axis == 0:
- result = coerce_to_dtypes(result, self.dtypes)
- if constructor is not None:
- result = Series(result, index=labels)
- return result
- def nunique(self, axis=0, dropna=True) -> Series:
- """
- Count distinct observations over requested axis.
- Return Series with number of distinct observations. Can ignore NaN
- values.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
- column-wise.
- dropna : bool, default True
- Don't include NaN in the counts.
- Returns
- -------
- Series
- See Also
- --------
- Series.nunique: Method nunique for Series.
- DataFrame.count: Count non-NA cells for each column or row.
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
- >>> df.nunique()
- A 3
- B 1
- dtype: int64
- >>> df.nunique(axis=1)
- 0 1
- 1 2
- 2 2
- dtype: int64
- """
- return self.apply(Series.nunique, axis=axis, dropna=dropna)
- def idxmin(self, axis=0, skipna=True) -> Series:
- """
- Return index of first occurrence of minimum over requested axis.
- NA/null values are excluded.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
- Returns
- -------
- Series
- Indexes of minima along the specified axis.
- Raises
- ------
- ValueError
- * If the row/column is empty
- See Also
- --------
- Series.idxmin
- Notes
- -----
- This method is the DataFrame version of ``ndarray.argmin``.
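- Examples
- --------
- A small invented frame to illustrate:
- >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [3, 0, 6]})
- >>> df.idxmin()
- A 0
- B 1
- dtype: int64
- >>> df.idxmin(axis=1)
- 0 A
- 1 B
- 2 A
- dtype: object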
- """
- axis = self._get_axis_number(axis)
- indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
- index = self._get_axis(axis)
- result = [index[i] if i >= 0 else np.nan for i in indices]
- return Series(result, index=self._get_agg_axis(axis))
- def idxmax(self, axis=0, skipna=True) -> Series:
- """
- Return index of first occurrence of maximum over requested axis.
- NA/null values are excluded.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
- Returns
- -------
- Series
- Indexes of maxima along the specified axis.
- Raises
- ------
- ValueError
- * If the row/column is empty
- See Also
- --------
- Series.idxmax
- Notes
- -----
- This method is the DataFrame version of ``ndarray.argmax``.
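- Examples
- --------
- A small invented frame to illustrate:
- >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [3, 0, 6]})
- >>> df.idxmax()
- A 2
- B 2
- dtype: int64
- >>> df.idxmax(axis=1)
- 0 B
- 1 A
- 2 B
- dtype: object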
- """
- axis = self._get_axis_number(axis)
- indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
- index = self._get_axis(axis)
- result = [index[i] if i >= 0 else np.nan for i in indices]
- return Series(result, index=self._get_agg_axis(axis))
- def _get_agg_axis(self, axis_num):
- """
- Return the labels of the aggregation result for the given axis
- number: ``columns`` for 0, ``index`` for 1.
- """
- if axis_num == 0:
- return self.columns
- elif axis_num == 1:
- return self.index
- else:
- raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
- def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame":
- """
- Get the mode(s) of each element along the selected axis.
- The mode of a set of values is the value that appears most often.
- It can be multiple values.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to iterate over while searching for the mode:
- * 0 or 'index' : get mode of each column
- * 1 or 'columns' : get mode of each row.
- numeric_only : bool, default False
- If True, only apply to numeric columns.
- dropna : bool, default True
- Don't consider counts of NaN/NaT.
- .. versionadded:: 0.24.0
- Returns
- -------
- DataFrame
- The modes of each column or row.
- See Also
- --------
- Series.mode : Return the highest frequency value in a Series.
- Series.value_counts : Return the counts of values in a Series.
- Examples
- --------
- >>> df = pd.DataFrame([('bird', 2, 2),
- ... ('mammal', 4, np.nan),
- ... ('arthropod', 8, 0),
- ... ('bird', 2, np.nan)],
- ... index=('falcon', 'horse', 'spider', 'ostrich'),
- ... columns=('species', 'legs', 'wings'))
- >>> df
- species legs wings
- falcon bird 2 2.0
- horse mammal 4 NaN
- spider arthropod 8 0.0
- ostrich bird 2 NaN
- By default, missing values are not considered, and the modes of wings
- are 0.0 and 2.0. The second row of species and legs contains ``NaN``
- because those columns have only one mode, but the result has two rows.
- >>> df.mode()
- species legs wings
- 0 bird 2.0 0.0
- 1 NaN NaN 2.0
- Setting ``dropna=False``, ``NaN`` values are considered, and they can
- be the mode (as for wings).
- >>> df.mode(dropna=False)
- species legs wings
- 0 bird 2 NaN
- With ``numeric_only=True``, only the mode of numeric columns is
- computed, and columns of other types are ignored.
- >>> df.mode(numeric_only=True)
- legs wings
- 0 2.0 0.0
- 1 NaN 2.0
- To compute the mode over columns and not rows, use the axis parameter:
- >>> df.mode(axis='columns', numeric_only=True)
- 0 1
- falcon 2.0 NaN
- horse 4.0 NaN
- spider 0.0 8.0
- ostrich 2.0 NaN
- """
- data = self if not numeric_only else self._get_numeric_data()
- def f(s):
- return s.mode(dropna=dropna)
- return data.apply(f, axis=axis)
- def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
- """
- Return values at the given quantile over requested axis.
- Parameters
- ----------
- q : float or array-like, default 0.5 (50% quantile)
- Value(s) between 0 and 1, the quantile(s) to compute.
- axis : {0, 1, 'index', 'columns'} (default 0)
- Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- numeric_only : bool, default True
- If False, the quantile of datetime and timedelta data will be
- computed as well.
- interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
- This optional parameter specifies the interpolation method to use,
- when the desired quantile lies between two data points `i` and `j`:
- * linear: `i + (j - i) * fraction`, where `fraction` is the
- fractional part of the index surrounded by `i` and `j`.
- * lower: `i`.
- * higher: `j`.
- * nearest: `i` or `j` whichever is nearest.
- * midpoint: (`i` + `j`) / 2.
- Returns
- -------
- Series or DataFrame
- If ``q`` is an array, a DataFrame will be returned where the
- index is ``q``, the columns are the columns of self, and the
- values are the quantiles.
- If ``q`` is a float, a Series will be returned where the
- index is the columns of self and the values are the quantiles.
- See Also
- --------
- core.window.Rolling.quantile: Rolling quantile.
- numpy.percentile: Numpy function to compute the percentile.
- Examples
- --------
- >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
- ... columns=['a', 'b'])
- >>> df.quantile(.1)
- a 1.3
- b 3.7
- Name: 0.1, dtype: float64
- >>> df.quantile([.1, .5])
- a b
- 0.1 1.3 3.7
- 0.5 2.5 55.0
- Specifying `numeric_only=False` will also compute the quantile of
- datetime and timedelta data.
- >>> df = pd.DataFrame({'A': [1, 2],
- ... 'B': [pd.Timestamp('2010'),
- ... pd.Timestamp('2011')],
- ... 'C': [pd.Timedelta('1 days'),
- ... pd.Timedelta('2 days')]})
- >>> df.quantile(0.5, numeric_only=False)
- A 1.5
- B 2010-07-02 12:00:00
- C 1 days 12:00:00
- Name: 0.5, dtype: object
- """
- validate_percentile(q)
- data = self._get_numeric_data() if numeric_only else self
- axis = self._get_axis_number(axis)
- is_transposed = axis == 1
- if is_transposed:
- data = data.T
- if len(data.columns) == 0:
- # GH#23925 _get_numeric_data may have dropped all columns
- cols = Index([], name=self.columns.name)
- if is_list_like(q):
- return self._constructor([], index=q, columns=cols)
- return self._constructor_sliced([], index=cols, name=q, dtype=np.float64)
- result = data._data.quantile(
- qs=q, axis=1, interpolation=interpolation, transposed=is_transposed
- )
- if result.ndim == 2:
- result = self._constructor(result)
- else:
- result = self._constructor_sliced(result, name=q)
- if is_transposed:
- result = result.T
- return result
- def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame":
- """
- Cast to DatetimeIndex of timestamps, at *beginning* of period.
- Parameters
- ----------
- freq : str, default frequency of PeriodIndex
- Desired frequency.
- how : {'s', 'e', 'start', 'end'}
- Convention for converting period to timestamp; start of period
- vs. end.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to convert (the index by default).
- copy : bool, default True
- If False then underlying input data is not copied.
- Returns
- -------
- DataFrame with DatetimeIndex
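- Examples
- --------
- A sketch with an invented monthly ``PeriodIndex``:
- >>> idx = pd.period_range('2020-01', periods=2, freq='M')
- >>> df = pd.DataFrame({'x': [1, 2]}, index=idx)
- >>> df.to_timestamp()
- x
- 2020-01-01 1
- 2020-02-01 2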
- """
- new_data = self._data
- if copy:
- new_data = new_data.copy()
- axis = self._get_axis_number(axis)
- if axis == 0:
- new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))
- elif axis == 1:
- new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
- else: # pragma: no cover
- raise AssertionError(f"Axis must be 0 or 1. Got {axis}")
- return self._constructor(new_data)
- def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame":
- """
- Convert DataFrame from DatetimeIndex to PeriodIndex.
- Convert DataFrame from DatetimeIndex to PeriodIndex with desired
- frequency (inferred from index if not passed).
- Parameters
- ----------
- freq : str, optional
- Frequency of the PeriodIndex; inferred from the index if not passed.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to convert (the index by default).
- copy : bool, default True
- If False then underlying input data is not copied.
- Returns
- -------
- DataFrame with PeriodIndex
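- Examples
- --------
- A sketch with an invented monthly ``DatetimeIndex``:
- >>> idx = pd.date_range('2020-01-31', periods=2, freq='M')
- >>> df = pd.DataFrame({'y': [1, 2]}, index=idx)
- >>> df.to_period()
- y
- 2020-01 1
- 2020-02 2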
- """
- new_data = self._data
- if copy:
- new_data = new_data.copy()
- axis = self._get_axis_number(axis)
- if axis == 0:
- new_data.set_axis(1, self.index.to_period(freq=freq))
- elif axis == 1:
- new_data.set_axis(0, self.columns.to_period(freq=freq))
- else: # pragma: no cover
- raise AssertionError(f"Axis must be 0 or 1. Got {axis}")
- return self._constructor(new_data)
- def isin(self, values) -> "DataFrame":
- """
- Whether each element in the DataFrame is contained in values.
- Parameters
- ----------
- values : iterable, Series, DataFrame or dict
- The result will only be true at a location if all the
- labels match. If `values` is a Series, that's the index. If
- `values` is a dict, the keys must be the column names,
- which must match. If `values` is a DataFrame,
- then both the index and column labels must match.
- Returns
- -------
- DataFrame
- DataFrame of booleans showing whether each element in the DataFrame
- is contained in values.
- See Also
- --------
- DataFrame.eq: Equality test for DataFrame.
- Series.isin: Equivalent method on Series.
- Series.str.contains: Test if pattern or regex is contained within a
- string of a Series or Index.
- Examples
- --------
- >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
- ... index=['falcon', 'dog'])
- >>> df
- num_legs num_wings
- falcon 2 2
- dog 4 0
- When ``values`` is a list check whether every value in the DataFrame
- is present in the list (which animals have 0 or 2 legs or wings)
- >>> df.isin([0, 2])
- num_legs num_wings
- falcon True True
- dog False True
- When ``values`` is a dict, we can pass values to check for each
- column separately:
- >>> df.isin({'num_wings': [0, 3]})
- num_legs num_wings
- falcon False False
- dog False True
- When ``values`` is a Series or DataFrame the index and column must
- match. Note that 'falcon' does not match based on the number of legs
- in df2.
- >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]},
- ... index=['spider', 'falcon'])
- >>> df.isin(other)
- num_legs num_wings
- falcon True True
- dog False False
- """
- if isinstance(values, dict):
- from pandas.core.reshape.concat import concat
- values = collections.defaultdict(list, values)
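- # defaultdict(list): columns missing from `values` are compared
- # against an empty list, so their entries come back False.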
- return concat(
- (
- self.iloc[:, [i]].isin(values[col])
- for i, col in enumerate(self.columns)
- ),
- axis=1,
- )
- elif isinstance(values, Series):
- if not values.index.is_unique:
- raise ValueError("cannot compute isin with a duplicate axis.")
- return self.eq(values.reindex_like(self), axis="index")
- elif isinstance(values, DataFrame):
- if not (values.columns.is_unique and values.index.is_unique):
- raise ValueError("cannot compute isin with a duplicate axis.")
- return self.eq(values.reindex_like(self))
- else:
- if not is_list_like(values):
- raise TypeError(
- "only list-like or dict-like objects are allowed "
- "to be passed to DataFrame.isin(), "
- f"you passed a {repr(type(values).__name__)}"
- )
- return DataFrame(
- algorithms.isin(self.values.ravel(), values).reshape(self.shape),
- self.index,
- self.columns,
- )
- # ----------------------------------------------------------------------
- # Add plotting methods to DataFrame
- plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
- hist = pandas.plotting.hist_frame
- boxplot = pandas.plotting.boxplot_frame
- sparse = CachedAccessor("sparse", SparseFrameAccessor)
- DataFrame._setup_axes(
- ["index", "columns"],
- docs={
- "index": "The index (row labels) of the DataFrame.",
- "columns": "The column labels of the DataFrame.",
- },
- )
- DataFrame._add_numeric_operations()
- DataFrame._add_series_or_dataframe_operations()
- ops.add_flex_arithmetic_methods(DataFrame)
- ops.add_special_arithmetic_methods(DataFrame)
- def _from_nested_dict(data):
- # TODO: this should be seriously cythonized
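- # Invert {index: {column: value}} into {column: {index: value}},
- # e.g. {'r': {'c': 1}} -> {'c': {'r': 1}}.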
- new_data = {}
- for index, s in data.items():
- for col, v in s.items():
- new_data[col] = new_data.get(col, {})
- new_data[col][index] = v
- return new_data
- def _put_str(s, space):
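- # Truncate `s` to `space` characters and left-justify, for
- # fixed-width text rendering.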
- return str(s)[:space].ljust(space)
|