  1. """
  2. DataFrame
  3. ---------
  4. An efficient 2D container for potentially mixed-type time series or other
  5. labeled data series.
  6. Similar to its R counterpart, data.frame, except providing automatic data
  7. alignment and a host of useful data manipulation methods having to do with the
  8. labeling information
  9. """
import collections
from collections import abc
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
    IO,
    TYPE_CHECKING,
    Any,
    FrozenSet,
    Hashable,
    Iterable,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    Union,
    cast,
)
import warnings

import numpy as np
import numpy.ma as ma

from pandas._config import get_option

from pandas._libs import algos as libalgos, lib
from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer
from pandas.compat import PY37
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (
    Appender,
    Substitution,
    deprecate_kwarg,
    rewrite_axis_style_signature,
)
from pandas.util._validators import (
    validate_axis_style_args,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    cast_scalar_to_array,
    coerce_to_dtypes,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_cast_to_datetime,
    maybe_convert_platform,
    maybe_downcast_to_dtype,
    maybe_infer_to_datetimelike,
    maybe_upcast,
    maybe_upcast_putmask,
)
from pandas.core.dtypes.common import (
    ensure_float64,
    ensure_int64,
    ensure_platform_int,
    infer_dtype_from_object,
    is_bool_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_named_tuple,
    is_object_dtype,
    is_scalar,
    is_sequence,
    needs_i8_conversion,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCIndexClass,
    ABCMultiIndex,
    ABCSeries,
)
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms, common as com, nanops, ops
from pandas.core.accessor import CachedAccessor
from pandas.core.arrays import Categorical, ExtensionArray
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.groupby import generic as groupby_generic
from pandas.core.indexes import base as ibase
from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.multi import maybe_droplevels
from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable
from pandas.core.internals import BlockManager
from pandas.core.internals.construction import (
    arrays_to_mgr,
    get_names_from_index,
    init_dict,
    init_ndarray,
    masked_rec_array_to_mgr,
    reorder_arrays,
    sanitize_index,
    to_arrays,
)
from pandas.core.ops.missing import dispatch_fill_zeros
from pandas.core.series import Series

from pandas.io.common import get_filepath_or_buffer
from pandas.io.formats import console, format as fmt
from pandas.io.formats.printing import pprint_thing

import pandas.plotting

if TYPE_CHECKING:
    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = dict(
    axes="index, columns",
    klass="DataFrame",
    axes_single_arg="{0 or 'index', 1 or 'columns'}",
    axis="""axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    optional_by="""
        by : str or list of str
            Name or list of names to sort by.

            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels.
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels.

            .. versionchanged:: 0.23.0

               Allow specifying index or column level names.""",
    versionadded_to_excel="",
    optional_labels="""labels : array-like, optional
            New labels / index to conform the axis specified by 'axis' to.""",
    optional_axis="""axis : int or str, optional
            Axis to target. Can be either the axis name ('index', 'columns')
            or number (0, 1).""",
)

_numeric_only_doc = """numeric_only : boolean, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data.
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : tuple of (str, str), default ('_x', '_y')
    Suffix to apply to overlapping column names in the left and right
    side, respectively. To raise an exception on overlapping columns use
    (False, False).
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. If a string, a column with that
    name will be added instead. The information column is Categorical-type
    and takes on a value of "left_only" for observations whose merge key
    only appears in the 'left' DataFrame, "right_only" for observations
    whose merge key only appears in the 'right' DataFrame, and "both" if
    the observation's merge key is found in both.
validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

    .. versionadded:: 0.21.0

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0.
Support for merging named Series objects was added in version 0.24.0.

Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')
"""


# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, or list-like objects.

        .. versionchanged:: 0.23.0
           If data is a dict, column order follows insertion-order for
           Python 3.6 and later.

        .. versionchanged:: 0.25.0
           If data is a list of dicts, column order follows insertion-order
           for Python 3.6 and later.

    index : Index or array-like
        Index to use for the resulting frame. Will default to RangeIndex if
        no indexing information is part of the input data and no index is
        provided.
    columns : Index or array-like
        Column labels to use for the resulting frame. Will default to
        RangeIndex (0, 1, 2, ..., n) if no column labels are provided.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool, default False
        Copy data from inputs. Only affects DataFrame / 2d ndarray input.

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv
    read_table
    read_clipboard

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9
    """

    _typ = "dataframe"

    @property
    def _constructor(self) -> Type["DataFrame"]:
        return DataFrame

    _constructor_sliced: Type[Series] = Series
    _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([])
    _accessors: Set[str] = {"sparse"}

    @property
    def _constructor_expanddim(self):
        raise NotImplementedError("Not supported for DataFrames!")

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data=None,
        index: Optional[Axes] = None,
        columns: Optional[Axes] = None,
        dtype: Optional[Dtype] = None,
        copy: bool = False,
    ):
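        # ``data`` determines the construction path below: reuse an existing
        # BlockManager; build from a dict, a (masked) ndarray / Series /
        # Index, or a generic iterable; otherwise fall back to treating
        # ``data`` as a scalar, which requires both ``index`` and
        # ``columns`` to broadcast against.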
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._data

        if isinstance(data, BlockManager):
            mgr = self._init_mgr(
                data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
            )
        elif isinstance(data, dict):
            mgr = init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)

            # a masked array
            else:
                mask = ma.getmaskarray(data)
                if mask.any():
                    data, fill_value = maybe_upcast(data, copy=True)
                    data.soften_mask()  # set hardmask False if it was True
                    data[mask] = fill_value
                else:
                    data = data.copy()
                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

        elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                data_columns = list(data.dtype.names)
                data = {k: data[k] for k in data_columns}
                if columns is None:
                    columns = data_columns
                mgr = init_dict(data, index, columns, dtype=dtype)
            elif getattr(data, "name", None) is not None:
                mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
            else:
                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

        # For data that is list-like, or an Iterable (which will be consumed
        # into a list)
        elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
            if not isinstance(data, (abc.Sequence, ExtensionArray)):
                data = list(data)
            if len(data) > 0:
                if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
                    arrays, columns = to_arrays(data, columns, dtype=dtype)
                    columns = ensure_index(columns)

                    # set the index
                    if index is None:
                        if isinstance(data[0], Series):
                            index = get_names_from_index(data)
                        elif isinstance(data[0], Categorical):
                            index = ibase.default_index(len(data[0]))
                        else:
                            index = ibase.default_index(len(data))

                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
                else:
                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
            else:
                mgr = init_dict({}, index, columns, dtype=dtype)
        else:
            try:
                arr = np.array(data, dtype=dtype, copy=copy)
            except (ValueError, TypeError) as e:
                exc = TypeError(
                    "DataFrame constructor called with "
                    f"incompatible data and dtype: {e}"
                )
                raise exc from e
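
            # A 0-dimensional result means ``data`` was a scalar; with both
            # ``index`` and ``columns`` supplied it can be broadcast to the
            # full (len(index), len(columns)) shape below.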
            if arr.ndim == 0 and index is not None and columns is not None:
                values = cast_scalar_to_array(
                    (len(index), len(columns)), data, dtype=dtype
                )
                mgr = init_ndarray(
                    values, index, columns, dtype=values.dtype, copy=False
                )
            else:
                raise ValueError("DataFrame constructor not properly called!")

        NDFrame.__init__(self, mgr, fastpath=True)

    # ----------------------------------------------------------------------

    @property
    def axes(self) -> List[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self) -> Tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    @property
    def _is_homogeneous_type(self) -> bool:
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        See Also
        --------
        Index._is_homogeneous_type : Whether the object has a single
            dtype.
        MultiIndex._is_homogeneous_type : Whether all the levels of a
            MultiIndex have the same dtype.

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...     "A": np.array([1, 2], dtype=np.int32),
        ...     "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if self._data.any_extension_types:
            return len({block.dtype for block in self._data.blocks}) == 1
        else:
            return not self._data.is_mixed_type

    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self) -> bool:
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if the full repr fits in the horizontal boundaries imposed by
        the display options width and max_columns.

        In case of a non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook; scripts ignore terminal
        # dims
        if ignore_width or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            # unlimited rows
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width

    def _info_repr(self) -> bool:
        """
        True if the repr should show the info view.
        """
        info_repr_option = get_option("display.large_repr") == "info"
        return info_repr_option and not (
            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
        )

    def __repr__(self) -> str:
        """
        Return a string representation for a particular DataFrame.
        """
        buf = StringIO("")
        if self._info_repr():
            self.info(buf=buf)
            return buf.getvalue()

        max_rows = get_option("display.max_rows")
        min_rows = get_option("display.min_rows")
        max_cols = get_option("display.max_columns")
        max_colwidth = get_option("display.max_colwidth")
        show_dimensions = get_option("display.show_dimensions")
        if get_option("display.expand_frame_repr"):
            width, _ = console.get_console_size()
        else:
            width = None
        self.to_string(
            buf=buf,
            max_rows=max_rows,
            min_rows=min_rows,
            max_cols=max_cols,
            line_width=width,
            max_colwidth=max_colwidth,
            show_dimensions=show_dimensions,
        )

        return buf.getvalue()

    def _repr_html_(self) -> Optional[str]:
        """
        Return an HTML representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        if self._info_repr():
            buf = StringIO("")
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
            return "<pre>" + val + "</pre>"

        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            min_rows = get_option("display.min_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            formatter = fmt.DataFrameFormatter(
                self,
                columns=None,
                col_space=None,
                na_rep="NaN",
                formatters=None,
                float_format=None,
                sparsify=None,
                justify=None,
                index_names=True,
                header=True,
                index=True,
                bold_rows=True,
                escape=True,
                max_rows=max_rows,
                min_rows=min_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=".",
                table_id=None,
                render_links=False,
            )
            return formatter.to_html(notebook=True)
        else:
            return None
  636. @Substitution(
  637. header_type="bool or sequence",
  638. header="Write out the column names. If a list of strings "
  639. "is given, it is assumed to be aliases for the "
  640. "column names",
  641. col_space_type="int",
  642. col_space="The minimum width of each column",
  643. )
  644. @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
  645. def to_string(
  646. self,
  647. buf: Optional[FilePathOrBuffer[str]] = None,
  648. columns: Optional[Sequence[str]] = None,
  649. col_space: Optional[int] = None,
  650. header: Union[bool, Sequence[str]] = True,
  651. index: bool = True,
  652. na_rep: str = "NaN",
  653. formatters: Optional[fmt.formatters_type] = None,
  654. float_format: Optional[fmt.float_format_type] = None,
  655. sparsify: Optional[bool] = None,
  656. index_names: bool = True,
  657. justify: Optional[str] = None,
  658. max_rows: Optional[int] = None,
  659. min_rows: Optional[int] = None,
  660. max_cols: Optional[int] = None,
  661. show_dimensions: bool = False,
  662. decimal: str = ".",
  663. line_width: Optional[int] = None,
  664. max_colwidth: Optional[int] = None,
  665. encoding: Optional[str] = None,
  666. ) -> Optional[str]:
  667. """
  668. Render a DataFrame to a console-friendly tabular output.
  669. %(shared_params)s
  670. line_width : int, optional
  671. Width to wrap a line in characters.
  672. max_colwidth : int, optional
  673. Max width to truncate each column in characters. By default, no limit.
  674. .. versionadded:: 1.0.0
  675. encoding : str, default "utf-8"
  676. Set character encoding.
  677. .. versionadded:: 1.0
  678. %(returns)s
  679. See Also
  680. --------
  681. to_html : Convert DataFrame to HTML.
  682. Examples
  683. --------
  684. >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
  685. >>> df = pd.DataFrame(d)
  686. >>> print(df.to_string())
   col1  col2
0     1     4
1     2     5
2     3     6
  691. """
  692. from pandas import option_context
  693. with option_context("display.max_colwidth", max_colwidth):
  694. formatter = fmt.DataFrameFormatter(
  695. self,
  696. columns=columns,
  697. col_space=col_space,
  698. na_rep=na_rep,
  699. formatters=formatters,
  700. float_format=float_format,
  701. sparsify=sparsify,
  702. justify=justify,
  703. index_names=index_names,
  704. header=header,
  705. index=index,
  706. min_rows=min_rows,
  707. max_rows=max_rows,
  708. max_cols=max_cols,
  709. show_dimensions=show_dimensions,
  710. decimal=decimal,
  711. line_width=line_width,
  712. )
  713. return formatter.to_string(buf=buf, encoding=encoding)
  714. # ----------------------------------------------------------------------
  715. @property
  716. def style(self) -> "Styler":
  717. """
  718. Returns a Styler object.
Contains methods for building a styled HTML representation of the
DataFrame.
  721. See Also
  722. --------
  723. io.formats.style.Styler
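Examples
--------
A minimal, illustrative sketch (the rendered HTML is not shown here;
``highlight_max`` is one of the Styler's built-in methods):

>>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
>>> df.style.highlight_max()  # doctest: +SKIP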
  724. """
  725. from pandas.io.formats.style import Styler
  726. return Styler(self)
  727. _shared_docs[
  728. "items"
  729. ] = r"""
  730. Iterate over (column name, Series) pairs.
  731. Iterates over the DataFrame columns, returning a tuple with
  732. the column name and the content as a Series.
  733. Yields
  734. ------
  735. label : object
  736. The column names for the DataFrame being iterated over.
  737. content : Series
  738. The column entries belonging to each label, as a Series.
  739. See Also
  740. --------
  741. DataFrame.iterrows : Iterate over DataFrame rows as
  742. (index, Series) pairs.
  743. DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
  744. of the values.
  745. Examples
  746. --------
  747. >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
  748. ... 'population': [1864, 22000, 80000]},
  749. ... index=['panda', 'polar', 'koala'])
  750. >>> df
  751. species population
  752. panda bear 1864
  753. polar bear 22000
  754. koala marsupial 80000
  755. >>> for label, content in df.items():
  756. ... print('label:', label)
  757. ... print('content:', content, sep='\n')
  758. ...
  759. label: species
  760. content:
  761. panda bear
  762. polar bear
  763. koala marsupial
  764. Name: species, dtype: object
  765. label: population
  766. content:
  767. panda 1864
  768. polar 22000
  769. koala 80000
  770. Name: population, dtype: int64
  771. """
  772. @Appender(_shared_docs["items"])
  773. def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
  774. if self.columns.is_unique and hasattr(self, "_item_cache"):
  775. for k in self.columns:
  776. yield k, self._get_item_cache(k)
  777. else:
  778. for i, k in enumerate(self.columns):
  779. yield k, self._ixs(i, axis=1)
  780. @Appender(_shared_docs["items"])
  781. def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
  782. yield from self.items()
  783. def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]:
  784. """
  785. Iterate over DataFrame rows as (index, Series) pairs.
  786. Yields
  787. ------
index : label or tuple of label
The index of the row. A tuple for a `MultiIndex`.
data : Series
The data of the row as a Series.
  794. See Also
  795. --------
  796. DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
  797. DataFrame.items : Iterate over (column name, Series) pairs.
  798. Notes
  799. -----
  800. 1. Because ``iterrows`` returns a Series for each row,
  801. it does **not** preserve dtypes across the rows (dtypes are
  802. preserved across columns for DataFrames). For example,
  803. >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
  804. >>> row = next(df.iterrows())[1]
  805. >>> row
  806. int 1.0
  807. float 1.5
  808. Name: 0, dtype: float64
  809. >>> print(row['int'].dtype)
  810. float64
  811. >>> print(df['int'].dtype)
  812. int64
  813. To preserve dtypes while iterating over the rows, it is better
  814. to use :meth:`itertuples` which returns namedtuples of the values
  815. and which is generally faster than ``iterrows``.
  816. 2. You should **never modify** something you are iterating over.
  817. This is not guaranteed to work in all cases. Depending on the
  818. data types, the iterator returns a copy and not a view, and writing
  819. to it will have no effect.
  820. """
  821. columns = self.columns
  822. klass = self._constructor_sliced
  823. for k, v in zip(self.index, self.values):
  824. s = klass(v, index=columns, name=k)
  825. yield k, s
  826. def itertuples(self, index=True, name="Pandas"):
  827. """
  828. Iterate over DataFrame rows as namedtuples.
  829. Parameters
  830. ----------
  831. index : bool, default True
  832. If True, return the index as the first element of the tuple.
  833. name : str or None, default "Pandas"
  834. The name of the returned namedtuples or None to return regular
  835. tuples.
  836. Returns
  837. -------
  838. iterator
  839. An object to iterate over namedtuples for each row in the
  840. DataFrame with the first field possibly being the index and
  841. following fields being the column values.
  842. See Also
  843. --------
  844. DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
  845. pairs.
  846. DataFrame.items : Iterate over (column name, Series) pairs.
  847. Notes
  848. -----
  849. The column names will be renamed to positional names if they are
  850. invalid Python identifiers, repeated, or start with an underscore.
On Python versions < 3.7, regular tuples are returned for DataFrames
with a large number of columns (>254).
  853. Examples
  854. --------
  855. >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
  856. ... index=['dog', 'hawk'])
  857. >>> df
      num_legs  num_wings
dog          4          0
hawk         2          2
  861. >>> for row in df.itertuples():
  862. ... print(row)
  863. ...
  864. Pandas(Index='dog', num_legs=4, num_wings=0)
  865. Pandas(Index='hawk', num_legs=2, num_wings=2)
  866. By setting the `index` parameter to False we can remove the index
  867. as the first element of the tuple:
  868. >>> for row in df.itertuples(index=False):
  869. ... print(row)
  870. ...
  871. Pandas(num_legs=4, num_wings=0)
  872. Pandas(num_legs=2, num_wings=2)
  873. With the `name` parameter set we set a custom name for the yielded
  874. namedtuples:
  875. >>> for row in df.itertuples(name='Animal'):
  876. ... print(row)
  877. ...
  878. Animal(Index='dog', num_legs=4, num_wings=0)
  879. Animal(Index='hawk', num_legs=2, num_wings=2)
  880. """
  881. arrays = []
  882. fields = list(self.columns)
  883. if index:
  884. arrays.append(self.index)
  885. fields.insert(0, "Index")
  886. # use integer indexing because of possible duplicate column names
  887. arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
  888. # Python versions before 3.7 support at most 255 arguments to constructors
  889. can_return_named_tuples = PY37 or len(self.columns) + index < 255
  890. if name is not None and can_return_named_tuples:
  891. itertuple = collections.namedtuple(name, fields, rename=True)
  892. return map(itertuple._make, zip(*arrays))
  893. # fallback to regular tuples
  894. return zip(*arrays)
  895. def __len__(self) -> int:
  896. """
  897. Returns length of info axis, but here we use the index.
  898. """
  899. return len(self.index)
  900. def dot(self, other):
  901. """
  902. Compute the matrix multiplication between the DataFrame and other.
  903. This method computes the matrix product between the DataFrame and the
values of another Series, DataFrame or a numpy array.
  905. It can also be called using ``self @ other`` in Python >= 3.5.
  906. Parameters
  907. ----------
  908. other : Series, DataFrame or array-like
  909. The other object to compute the matrix product with.
  910. Returns
  911. -------
  912. Series or DataFrame
If other is a Series, return the matrix product between self and
other as a Series. If other is a DataFrame or a numpy.array, return
the matrix product of self and other as a DataFrame.
  916. See Also
  917. --------
  918. Series.dot: Similar method for Series.
  919. Notes
  920. -----
  921. The dimensions of DataFrame and other must be compatible in order to
  922. compute the matrix multiplication. In addition, the column names of
  923. DataFrame and the index of other must contain the same values, as they
  924. will be aligned prior to the multiplication.
  925. The dot method for Series computes the inner product, instead of the
  926. matrix product here.
  927. Examples
  928. --------
  929. Here we multiply a DataFrame with a Series.
  930. >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
  931. >>> s = pd.Series([1, 1, 2, 1])
  932. >>> df.dot(s)
  933. 0 -4
  934. 1 5
  935. dtype: int64
  936. Here we multiply a DataFrame with another DataFrame.
  937. >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
  938. >>> df.dot(other)
  939. 0 1
  940. 0 1 4
  941. 1 2 2
Note that the dot method gives the same result as ``@``:
  943. >>> df @ other
  944. 0 1
  945. 0 1 4
  946. 1 2 2
The dot method also works if other is a np.array.
  948. >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
  949. >>> df.dot(arr)
  950. 0 1
  951. 0 1 4
  952. 1 2 2
Note how shuffling the objects does not change the result, as they
are aligned before multiplying.
  954. >>> s2 = s.reindex([1, 0, 2, 3])
  955. >>> df.dot(s2)
  956. 0 -4
  957. 1 5
  958. dtype: int64
  959. """
  960. if isinstance(other, (Series, DataFrame)):
  961. common = self.columns.union(other.index)
  962. if len(common) > len(self.columns) or len(common) > len(other.index):
  963. raise ValueError("matrices are not aligned")
  964. left = self.reindex(columns=common, copy=False)
  965. right = other.reindex(index=common, copy=False)
  966. lvals = left.values
  967. rvals = right.values
  968. else:
  969. left = self
  970. lvals = self.values
  971. rvals = np.asarray(other)
  972. if lvals.shape[1] != rvals.shape[0]:
  973. raise ValueError(
  974. f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
  975. )
  976. if isinstance(other, DataFrame):
  977. return self._constructor(
  978. np.dot(lvals, rvals), index=left.index, columns=other.columns
  979. )
  980. elif isinstance(other, Series):
  981. return Series(np.dot(lvals, rvals), index=left.index)
  982. elif isinstance(rvals, (np.ndarray, Index)):
  983. result = np.dot(lvals, rvals)
  984. if result.ndim == 2:
  985. return self._constructor(result, index=left.index)
  986. else:
  987. return Series(result, index=left.index)
  988. else: # pragma: no cover
  989. raise TypeError(f"unsupported type: {type(other)}")
  990. def __matmul__(self, other):
  991. """
  992. Matrix multiplication using binary `@` operator in Python>=3.5.
  993. """
  994. return self.dot(other)
  995. def __rmatmul__(self, other):
  996. """
  997. Matrix multiplication using binary `@` operator in Python>=3.5.
  998. """
  999. return self.T.dot(np.transpose(other)).T
  1000. # ----------------------------------------------------------------------
  1001. # IO methods (to / from other formats)
  1002. @classmethod
  1003. def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame":
  1004. """
  1005. Construct DataFrame from dict of array-like or dicts.
  1006. Creates DataFrame object from dictionary by columns or by index
  1007. allowing dtype specification.
  1008. Parameters
  1009. ----------
  1010. data : dict
  1011. Of the form {field : array-like} or {field : dict}.
  1012. orient : {'columns', 'index'}, default 'columns'
  1013. The "orientation" of the data. If the keys of the passed dict
  1014. should be the columns of the resulting DataFrame, pass 'columns'
  1015. (default). Otherwise if the keys should be rows, pass 'index'.
  1016. dtype : dtype, default None
  1017. Data type to force, otherwise infer.
  1018. columns : list, default None
  1019. Column labels to use when ``orient='index'``. Raises a ValueError
  1020. if used with ``orient='columns'``.
  1021. .. versionadded:: 0.23.0
  1022. Returns
  1023. -------
  1024. DataFrame
  1025. See Also
  1026. --------
  1027. DataFrame.from_records : DataFrame from ndarray (structured
  1028. dtype), list of tuples, dict, or DataFrame.
  1029. DataFrame : DataFrame object creation using constructor.
  1030. Examples
  1031. --------
  1032. By default the keys of the dict become the DataFrame columns:
  1033. >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
  1034. >>> pd.DataFrame.from_dict(data)
  1035. col_1 col_2
  1036. 0 3 a
  1037. 1 2 b
  1038. 2 1 c
  1039. 3 0 d
  1040. Specify ``orient='index'`` to create the DataFrame using dictionary
  1041. keys as rows:
  1042. >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
  1043. >>> pd.DataFrame.from_dict(data, orient='index')
  1044. 0 1 2 3
  1045. row_1 3 2 1 0
  1046. row_2 a b c d
  1047. When using the 'index' orientation, the column names can be
  1048. specified manually:
  1049. >>> pd.DataFrame.from_dict(data, orient='index',
  1050. ... columns=['A', 'B', 'C', 'D'])
  1051. A B C D
  1052. row_1 3 2 1 0
  1053. row_2 a b c d
  1054. """
  1055. index = None
  1056. orient = orient.lower()
  1057. if orient == "index":
  1058. if len(data) > 0:
  1059. # TODO speed up Series case
  1060. if isinstance(list(data.values())[0], (Series, dict)):
  1061. data = _from_nested_dict(data)
  1062. else:
  1063. data, index = list(data.values()), list(data.keys())
  1064. elif orient == "columns":
  1065. if columns is not None:
  1066. raise ValueError("cannot use columns parameter with orient='columns'")
  1067. else: # pragma: no cover
  1068. raise ValueError("only recognize index or columns for orient")
  1069. return cls(data, index=index, columns=columns, dtype=dtype)
  1070. def to_numpy(self, dtype=None, copy=False) -> np.ndarray:
  1071. """
  1072. Convert the DataFrame to a NumPy array.
  1073. .. versionadded:: 0.24.0
  1074. By default, the dtype of the returned array will be the common NumPy
  1075. dtype of all types in the DataFrame. For example, if the dtypes are
``float16`` and ``float32``, the resulting dtype will be ``float32``.
  1077. This may require copying data and coercing values, which may be
  1078. expensive.
  1079. Parameters
  1080. ----------
  1081. dtype : str or numpy.dtype, optional
  1082. The dtype to pass to :meth:`numpy.asarray`.
  1083. copy : bool, default False
Whether to ensure that the returned value is not a view on
another array. Note that ``copy=False`` does not *ensure* that
``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
  1087. a copy is made, even if not strictly necessary.
  1088. Returns
  1089. -------
  1090. numpy.ndarray
  1091. See Also
  1092. --------
  1093. Series.to_numpy : Similar method for Series.
  1094. Examples
  1095. --------
  1096. >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
  1097. array([[1, 3],
  1098. [2, 4]])
  1099. With heterogeneous data, the lowest common type will have to
  1100. be used.
  1101. >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
  1102. >>> df.to_numpy()
  1103. array([[1. , 3. ],
  1104. [2. , 4.5]])
  1105. For a mix of numeric and non-numeric types, the output array will
  1106. have object dtype.
  1107. >>> df['C'] = pd.date_range('2000', periods=2)
  1108. >>> df.to_numpy()
  1109. array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
  1110. [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
  1111. """
  1112. result = np.array(self.values, dtype=dtype, copy=copy)
  1113. return result
  1114. def to_dict(self, orient="dict", into=dict):
  1115. """
  1116. Convert the DataFrame to a dictionary.
  1117. The type of the key-value pairs can be customized with the parameters
  1118. (see below).
  1119. Parameters
  1120. ----------
  1121. orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
  1122. Determines the type of the values of the dictionary.
  1123. - 'dict' (default) : dict like {column -> {index -> value}}
  1124. - 'list' : dict like {column -> [values]}
  1125. - 'series' : dict like {column -> Series(values)}
  1126. - 'split' : dict like
  1127. {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
  1128. - 'records' : list like
  1129. [{column -> value}, ... , {column -> value}]
  1130. - 'index' : dict like {index -> {column -> value}}
  1131. Abbreviations are allowed. `s` indicates `series` and `sp`
  1132. indicates `split`.
  1133. into : class, default dict
  1134. The collections.abc.Mapping subclass used for all Mappings
  1135. in the return value. Can be the actual class or an empty
  1136. instance of the mapping type you want. If you want a
  1137. collections.defaultdict, you must pass it initialized.
  1138. .. versionadded:: 0.21.0
  1139. Returns
  1140. -------
  1141. dict, list or collections.abc.Mapping
  1142. Return a collections.abc.Mapping object representing the DataFrame.
  1143. The resulting transformation depends on the `orient` parameter.
  1144. See Also
  1145. --------
  1146. DataFrame.from_dict: Create a DataFrame from a dictionary.
  1147. DataFrame.to_json: Convert a DataFrame to JSON format.
  1148. Examples
  1149. --------
  1150. >>> df = pd.DataFrame({'col1': [1, 2],
  1151. ... 'col2': [0.5, 0.75]},
  1152. ... index=['row1', 'row2'])
  1153. >>> df
  1154. col1 col2
  1155. row1 1 0.50
  1156. row2 2 0.75
  1157. >>> df.to_dict()
  1158. {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
  1159. You can specify the return orientation.
  1160. >>> df.to_dict('series')
  1161. {'col1': row1 1
  1162. row2 2
  1163. Name: col1, dtype: int64,
  1164. 'col2': row1 0.50
  1165. row2 0.75
  1166. Name: col2, dtype: float64}
  1167. >>> df.to_dict('split')
  1168. {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
  1169. 'data': [[1, 0.5], [2, 0.75]]}
  1170. >>> df.to_dict('records')
  1171. [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
  1172. >>> df.to_dict('index')
  1173. {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
  1174. You can also specify the mapping type.
  1175. >>> from collections import OrderedDict, defaultdict
  1176. >>> df.to_dict(into=OrderedDict)
  1177. OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
  1178. ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
  1179. If you want a `defaultdict`, you need to initialize it:
  1180. >>> dd = defaultdict(list)
  1181. >>> df.to_dict('records', into=dd)
  1182. [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
  1183. defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
  1184. """
  1185. if not self.columns.is_unique:
  1186. warnings.warn(
  1187. "DataFrame columns are not unique, some columns will be omitted.",
  1188. UserWarning,
  1189. stacklevel=2,
  1190. )
  1191. # GH16122
  1192. into_c = com.standardize_mapping(into)
  1193. if orient.lower().startswith("d"):
  1194. return into_c((k, v.to_dict(into)) for k, v in self.items())
  1195. elif orient.lower().startswith("l"):
  1196. return into_c((k, v.tolist()) for k, v in self.items())
  1197. elif orient.lower().startswith("sp"):
  1198. return into_c(
  1199. (
  1200. ("index", self.index.tolist()),
  1201. ("columns", self.columns.tolist()),
  1202. (
  1203. "data",
  1204. [
  1205. list(map(com.maybe_box_datetimelike, t))
  1206. for t in self.itertuples(index=False, name=None)
  1207. ],
  1208. ),
  1209. )
  1210. )
  1211. elif orient.lower().startswith("s"):
  1212. return into_c((k, com.maybe_box_datetimelike(v)) for k, v in self.items())
  1213. elif orient.lower().startswith("r"):
  1214. columns = self.columns.tolist()
  1215. rows = (
  1216. dict(zip(columns, row))
  1217. for row in self.itertuples(index=False, name=None)
  1218. )
  1219. return [
  1220. into_c((k, com.maybe_box_datetimelike(v)) for k, v in row.items())
  1221. for row in rows
  1222. ]
  1223. elif orient.lower().startswith("i"):
  1224. if not self.index.is_unique:
  1225. raise ValueError("DataFrame index must be unique for orient='index'.")
  1226. return into_c(
  1227. (t[0], dict(zip(self.columns, t[1:])))
  1228. for t in self.itertuples(name=None)
  1229. )
  1230. else:
  1231. raise ValueError(f"orient '{orient}' not understood")
  1232. def to_gbq(
  1233. self,
  1234. destination_table,
  1235. project_id=None,
  1236. chunksize=None,
  1237. reauth=False,
  1238. if_exists="fail",
  1239. auth_local_webserver=False,
  1240. table_schema=None,
  1241. location=None,
  1242. progress_bar=True,
  1243. credentials=None,
  1244. ) -> None:
  1245. """
  1246. Write a DataFrame to a Google BigQuery table.
  1247. This function requires the `pandas-gbq package
  1248. <https://pandas-gbq.readthedocs.io>`__.
  1249. See the `How to authenticate with Google BigQuery
  1250. <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
  1251. guide for authentication instructions.
  1252. Parameters
  1253. ----------
  1254. destination_table : str
  1255. Name of table to be written, in the form ``dataset.tablename``.
  1256. project_id : str, optional
  1257. Google BigQuery Account project ID. Optional when available from
  1258. the environment.
  1259. chunksize : int, optional
  1260. Number of rows to be inserted in each chunk from the dataframe.
  1261. Set to ``None`` to load the whole dataframe at once.
  1262. reauth : bool, default False
  1263. Force Google BigQuery to re-authenticate the user. This is useful
  1264. if multiple accounts are used.
  1265. if_exists : str, default 'fail'
  1266. Behavior when the destination table exists. Value can be one of:
  1267. ``'fail'``
  1268. If table exists raise pandas_gbq.gbq.TableCreationError.
  1269. ``'replace'``
  1270. If table exists, drop it, recreate it, and insert data.
  1271. ``'append'``
  1272. If table exists, insert data. Create if does not exist.
  1273. auth_local_webserver : bool, default False
  1274. Use the `local webserver flow`_ instead of the `console flow`_
  1275. when getting user credentials.
  1276. .. _local webserver flow:
  1277. http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
  1278. .. _console flow:
  1279. http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
  1280. *New in version 0.2.0 of pandas-gbq*.
  1281. table_schema : list of dicts, optional
List of BigQuery table fields to which the DataFrame columns
conform, e.g. ``[{'name': 'col1', 'type':
'STRING'},...]``. If a schema is not provided, it will be
generated according to the dtypes of the DataFrame columns. See
the BigQuery API documentation on available names of a field.
  1287. *New in version 0.3.1 of pandas-gbq*.
  1288. location : str, optional
  1289. Location where the load job should run. See the `BigQuery locations
  1290. documentation
  1291. <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
  1292. list of available locations. The location must match that of the
  1293. target dataset.
  1294. *New in version 0.5.0 of pandas-gbq*.
  1295. progress_bar : bool, default True
  1296. Use the library `tqdm` to show the progress bar for the upload,
  1297. chunk by chunk.
  1298. *New in version 0.5.0 of pandas-gbq*.
  1299. credentials : google.auth.credentials.Credentials, optional
  1300. Credentials for accessing Google APIs. Use this parameter to
  1301. override default credentials, such as to use Compute Engine
  1302. :class:`google.auth.compute_engine.Credentials` or Service
  1303. Account :class:`google.oauth2.service_account.Credentials`
  1304. directly.
  1305. *New in version 0.8.0 of pandas-gbq*.
  1306. .. versionadded:: 0.24.0
  1307. See Also
  1308. --------
  1309. pandas_gbq.to_gbq : This function in the pandas-gbq library.
  1310. read_gbq : Read a DataFrame from Google BigQuery.
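Examples
--------
A minimal sketch; ``'my_dataset.my_table'`` and ``'my-project'`` are
placeholders for an existing BigQuery dataset and project:

>>> df = pd.DataFrame({'my_col': [1, 2, 3]})
>>> df.to_gbq('my_dataset.my_table',
...           project_id='my-project',
...           if_exists='append')  # doctest: +SKIP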
  1311. """
  1312. from pandas.io import gbq
  1313. gbq.to_gbq(
  1314. self,
  1315. destination_table,
  1316. project_id=project_id,
  1317. chunksize=chunksize,
  1318. reauth=reauth,
  1319. if_exists=if_exists,
  1320. auth_local_webserver=auth_local_webserver,
  1321. table_schema=table_schema,
  1322. location=location,
  1323. progress_bar=progress_bar,
  1324. credentials=credentials,
  1325. )
  1326. @classmethod
  1327. def from_records(
  1328. cls,
  1329. data,
  1330. index=None,
  1331. exclude=None,
  1332. columns=None,
  1333. coerce_float=False,
  1334. nrows=None,
  1335. ) -> "DataFrame":
  1336. """
  1337. Convert structured or record ndarray to DataFrame.
  1338. Parameters
  1339. ----------
  1340. data : ndarray (structured dtype), list of tuples, dict, or DataFrame
  1341. index : str, list of fields, array-like
  1342. Field of array to use as the index, alternately a specific set of
  1343. input labels to use.
  1344. exclude : sequence, default None
  1345. Columns or fields to exclude.
  1346. columns : sequence, default None
  1347. Column names to use. If the passed data do not have names
  1348. associated with them, this argument provides names for the
  1349. columns. Otherwise this argument indicates the order of the columns
  1350. in the result (any names not found in the data will become all-NA
  1351. columns).
  1352. coerce_float : bool, default False
  1353. Attempt to convert values of non-string, non-numeric objects (like
  1354. decimal.Decimal) to floating point, useful for SQL result sets.
  1355. nrows : int, default None
  1356. Number of rows to read if data is an iterator.
  1357. Returns
  1358. -------
  1359. DataFrame
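Examples
--------
An illustrative sketch using a structured ndarray, whose field names
become the columns:

>>> data = np.array([(1, 2.0, 'a'), (3, 4.0, 'b')],
...                 dtype=[('x', 'i8'), ('y', 'f8'), ('z', 'O')])
>>> pd.DataFrame.from_records(data)
   x    y  z
0  1  2.0  a
1  3  4.0  b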
  1360. """
  1361. # Make a copy of the input columns so we can modify it
  1362. if columns is not None:
  1363. columns = ensure_index(columns)
  1364. if is_iterator(data):
  1365. if nrows == 0:
  1366. return cls()
  1367. try:
  1368. first_row = next(data)
  1369. except StopIteration:
  1370. return cls(index=index, columns=columns)
  1371. dtype = None
  1372. if hasattr(first_row, "dtype") and first_row.dtype.names:
  1373. dtype = first_row.dtype
  1374. values = [first_row]
  1375. if nrows is None:
  1376. values += data
  1377. else:
  1378. values.extend(itertools.islice(data, nrows - 1))
  1379. if dtype is not None:
  1380. data = np.array(values, dtype=dtype)
  1381. else:
  1382. data = values
  1383. if isinstance(data, dict):
  1384. if columns is None:
  1385. columns = arr_columns = ensure_index(sorted(data))
  1386. arrays = [data[k] for k in columns]
  1387. else:
  1388. arrays = []
  1389. arr_columns = []
  1390. for k, v in data.items():
  1391. if k in columns:
  1392. arr_columns.append(k)
  1393. arrays.append(v)
  1394. arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns)
  1395. elif isinstance(data, (np.ndarray, DataFrame)):
  1396. arrays, columns = to_arrays(data, columns)
  1397. if columns is not None:
  1398. columns = ensure_index(columns)
  1399. arr_columns = columns
  1400. else:
  1401. arrays, arr_columns = to_arrays(data, columns, coerce_float=coerce_float)
  1402. arr_columns = ensure_index(arr_columns)
  1403. if columns is not None:
  1404. columns = ensure_index(columns)
  1405. else:
  1406. columns = arr_columns
  1407. if exclude is None:
  1408. exclude = set()
  1409. else:
  1410. exclude = set(exclude)
  1411. result_index = None
  1412. if index is not None:
  1413. if isinstance(index, str) or not hasattr(index, "__iter__"):
  1414. i = columns.get_loc(index)
  1415. exclude.add(index)
  1416. if len(arrays) > 0:
  1417. result_index = Index(arrays[i], name=index)
  1418. else:
  1419. result_index = Index([], name=index)
  1420. else:
  1421. try:
  1422. index_data = [arrays[arr_columns.get_loc(field)] for field in index]
  1423. except (KeyError, TypeError):
  1424. # raised by get_loc, see GH#29258
  1425. result_index = index
  1426. else:
  1427. result_index = ensure_index_from_sequences(index_data, names=index)
  1428. exclude.update(index)
if exclude:  # note: any(exclude) would wrongly skip falsy labels like 0 or ""
  1430. arr_exclude = [x for x in exclude if x in arr_columns]
  1431. to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
  1432. arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
  1433. arr_columns = arr_columns.drop(arr_exclude)
  1434. columns = columns.drop(exclude)
  1435. mgr = arrays_to_mgr(arrays, arr_columns, result_index, columns)
  1436. return cls(mgr)
  1437. def to_records(
  1438. self, index=True, column_dtypes=None, index_dtypes=None
  1439. ) -> np.recarray:
  1440. """
  1441. Convert DataFrame to a NumPy record array.
  1442. Index will be included as the first field of the record array if
  1443. requested.
  1444. Parameters
  1445. ----------
  1446. index : bool, default True
  1447. Include index in resulting record array, stored in 'index'
  1448. field or using the index label, if set.
  1449. column_dtypes : str, type, dict, default None
  1450. .. versionadded:: 0.24.0
  1451. If a string or type, the data type to store all columns. If
  1452. a dictionary, a mapping of column names and indices (zero-indexed)
  1453. to specific data types.
  1454. index_dtypes : str, type, dict, default None
  1455. .. versionadded:: 0.24.0
  1456. If a string or type, the data type to store all index levels. If
  1457. a dictionary, a mapping of index level names and indices
  1458. (zero-indexed) to specific data types.
  1459. This mapping is applied only if `index=True`.
  1460. Returns
  1461. -------
  1462. numpy.recarray
  1463. NumPy ndarray with the DataFrame labels as fields and each row
  1464. of the DataFrame as entries.
  1465. See Also
  1466. --------
  1467. DataFrame.from_records: Convert structured or record ndarray
  1468. to DataFrame.
  1469. numpy.recarray: An ndarray that allows field access using
  1470. attributes, analogous to typed columns in a
  1471. spreadsheet.
  1472. Examples
  1473. --------
  1474. >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
  1475. ... index=['a', 'b'])
  1476. >>> df
  1477. A B
  1478. a 1 0.50
  1479. b 2 0.75
  1480. >>> df.to_records()
  1481. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  1482. dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
  1483. If the DataFrame index has no label then the recarray field name
  1484. is set to 'index'. If the index has a label then this is used as the
  1485. field name:
  1486. >>> df.index = df.index.rename("I")
  1487. >>> df.to_records()
  1488. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  1489. dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
  1490. The index can be excluded from the record array:
  1491. >>> df.to_records(index=False)
  1492. rec.array([(1, 0.5 ), (2, 0.75)],
  1493. dtype=[('A', '<i8'), ('B', '<f8')])
  1494. Data types can be specified for the columns:
  1495. >>> df.to_records(column_dtypes={"A": "int32"})
  1496. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  1497. dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
  1498. As well as for the index:
  1499. >>> df.to_records(index_dtypes="<S2")
  1500. rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
  1501. dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
  1502. >>> index_dtypes = f"<S{df.index.str.len().max()}"
  1503. >>> df.to_records(index_dtypes=index_dtypes)
  1504. rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
  1505. dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
  1506. """
  1507. if index:
  1508. if isinstance(self.index, ABCMultiIndex):
# convert the array of index tuples to per-level numpy columns (copies)
  1510. ix_vals = list(map(np.array, zip(*self.index.values)))
  1511. else:
  1512. ix_vals = [self.index.values]
  1513. arrays = ix_vals + [self[c]._internal_get_values() for c in self.columns]
  1514. count = 0
  1515. index_names = list(self.index.names)
  1516. if isinstance(self.index, ABCMultiIndex):
  1517. for i, n in enumerate(index_names):
  1518. if n is None:
  1519. index_names[i] = f"level_{count}"
  1520. count += 1
  1521. elif index_names[0] is None:
  1522. index_names = ["index"]
  1523. names = [str(name) for name in itertools.chain(index_names, self.columns)]
  1524. else:
  1525. arrays = [self[c]._internal_get_values() for c in self.columns]
  1526. names = [str(c) for c in self.columns]
  1527. index_names = []
  1528. index_len = len(index_names)
  1529. formats = []
  1530. for i, v in enumerate(arrays):
  1531. index = i
  1532. # When the names and arrays are collected, we
  1533. # first collect those in the DataFrame's index,
  1534. # followed by those in its columns.
  1535. #
  1536. # Thus, the total length of the array is:
  1537. # len(index_names) + len(DataFrame.columns).
  1538. #
  1539. # This check allows us to see whether we are
  1540. # handling a name / array in the index or column.
  1541. if index < index_len:
  1542. dtype_mapping = index_dtypes
  1543. name = index_names[index]
  1544. else:
  1545. index -= index_len
  1546. dtype_mapping = column_dtypes
  1547. name = self.columns[index]
# If we have a dictionary, we get the data type
# associated with the index or column (which can
# be denoted by its name in the DataFrame or its
# position in the DataFrame's array of indices or
# columns, whichever is applicable).
  1553. if is_dict_like(dtype_mapping):
  1554. if name in dtype_mapping:
  1555. dtype_mapping = dtype_mapping[name]
  1556. elif index in dtype_mapping:
  1557. dtype_mapping = dtype_mapping[index]
  1558. else:
  1559. dtype_mapping = None
  1560. # If no mapping can be found, use the array's
  1561. # dtype attribute for formatting.
  1562. #
  1563. # A valid dtype must either be a type or
  1564. # string naming a type.
  1565. if dtype_mapping is None:
  1566. formats.append(v.dtype)
  1567. elif isinstance(dtype_mapping, (type, np.dtype, str)):
  1568. formats.append(dtype_mapping)
  1569. else:
  1570. element = "row" if i < index_len else "column"
  1571. msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
  1572. raise ValueError(msg)
  1573. return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
  1574. @classmethod
  1575. def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame":
  1576. mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
  1577. return cls(mgr)
  1578. @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
  1579. def to_stata(
  1580. self,
  1581. path,
  1582. convert_dates=None,
  1583. write_index=True,
  1584. byteorder=None,
  1585. time_stamp=None,
  1586. data_label=None,
  1587. variable_labels=None,
  1588. version=114,
  1589. convert_strl=None,
  1590. ):
  1591. """
  1592. Export DataFrame object to Stata dta format.
  1593. Writes the DataFrame to a Stata dataset file.
  1594. "dta" files contain a Stata dataset.
  1595. Parameters
  1596. ----------
  1597. path : str, buffer or path object
  1598. String, path object (pathlib.Path or py._path.local.LocalPath) or
  1599. object implementing a binary write() function. If using a buffer
  1600. then the buffer will not be automatically closed after the file
  1601. data has been written.
  1602. .. versionchanged:: 1.0.0
  1603. Previously this was "fname"
  1604. convert_dates : dict
  1605. Dictionary mapping columns containing datetime types to stata
  1606. internal format to use when writing the dates. Options are 'tc',
  1607. 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
  1608. or a name. Datetime columns that do not have a conversion type
  1609. specified will be converted to 'tc'. Raises NotImplementedError if
  1610. a datetime column has timezone information.
  1611. write_index : bool
  1612. Write the index to Stata dataset.
  1613. byteorder : str
  1614. Can be ">", "<", "little", or "big". default is `sys.byteorder`.
  1615. time_stamp : datetime
  1616. A datetime to use as file creation date. Default is the current
  1617. time.
  1618. data_label : str, optional
  1619. A label for the data set. Must be 80 characters or smaller.
  1620. variable_labels : dict
  1621. Dictionary containing columns as keys and variable labels as
  1622. values. Each label must be 80 characters or smaller.
  1623. version : {114, 117, 118, 119, None}, default 114
  1624. Version to use in the output dta file. Set to None to let pandas
  1625. decide between 118 or 119 formats depending on the number of
  1626. columns in the frame. Version 114 can be read by Stata 10 and
  1627. later. Version 117 can be read by Stata 13 or later. Version 118
  1628. is supported in Stata 14 and later. Version 119 is supported in
  1629. Stata 15 and later. Version 114 limits string variables to 244
  1630. characters or fewer while versions 117 and later allow strings
  1631. with lengths up to 2,000,000 characters. Versions 118 and 119
  1632. support Unicode characters, and version 119 supports more than
  1633. 32,767 variables.
  1634. .. versionadded:: 0.23.0
  1635. .. versionchanged:: 1.0.0
  1636. Added support for formats 118 and 119.
  1637. convert_strl : list, optional
  1638. List of column names to convert to string columns to Stata StrL
  1639. format. Only available if version is 117. Storing strings in the
  1640. StrL format can produce smaller dta files if strings have more than
  1641. 8 characters and values are repeated.
  1642. .. versionadded:: 0.23.0
  1643. Raises
  1644. ------
  1645. NotImplementedError
  1646. * If datetimes contain timezone information
  1647. * Column dtype is not representable in Stata
  1648. ValueError
* Columns listed in convert_dates are neither datetime64[ns]
nor datetime.datetime
  1651. * Column listed in convert_dates is not in DataFrame
  1652. * Categorical label contains more than 32,000 characters
  1653. See Also
  1654. --------
  1655. read_stata : Import Stata data files.
  1656. io.stata.StataWriter : Low-level writer for Stata data files.
  1657. io.stata.StataWriter117 : Low-level writer for version 117 files.
  1658. Examples
  1659. --------
  1660. >>> df = pd.DataFrame({'animal': ['falcon', 'parrot', 'falcon',
  1661. ... 'parrot'],
  1662. ... 'speed': [350, 18, 361, 15]})
  1663. >>> df.to_stata('animals.dta') # doctest: +SKIP
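A sketch of the 117 (Stata 13) format with StrL conversion, which the
parameters above describe; the file name is a placeholder:

>>> df.to_stata('animals.dta', version=117,
...             convert_strl=['animal'])  # doctest: +SKIP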
  1664. """
  1665. if version not in (114, 117, 118, 119, None):
  1666. raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
  1667. if version == 114:
  1668. if convert_strl is not None:
  1669. raise ValueError("strl is not supported in format 114")
  1670. from pandas.io.stata import StataWriter as statawriter
  1671. elif version == 117:
  1672. from pandas.io.stata import StataWriter117 as statawriter
  1673. else: # versions 118 and 119
  1674. from pandas.io.stata import StataWriterUTF8 as statawriter
  1675. kwargs = {}
  1676. if version is None or version >= 117:
  1677. # strl conversion is only supported >= 117
  1678. kwargs["convert_strl"] = convert_strl
  1679. if version is None or version >= 118:
  1680. # Specifying the version is only supported for UTF8 (118 or 119)
  1681. kwargs["version"] = version
  1682. writer = statawriter(
  1683. path,
  1684. self,
  1685. convert_dates=convert_dates,
  1686. byteorder=byteorder,
  1687. time_stamp=time_stamp,
  1688. data_label=data_label,
  1689. write_index=write_index,
  1690. variable_labels=variable_labels,
  1691. **kwargs,
  1692. )
  1693. writer.write_file()
  1694. @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
  1695. def to_feather(self, path) -> None:
  1696. """
  1697. Write out the binary feather-format for DataFrames.
  1698. Parameters
  1699. ----------
  1700. path : str
  1701. String file path.
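Examples
--------
A minimal sketch; requires the optional ``pyarrow`` dependency, and the
file name is a placeholder:

>>> df = pd.DataFrame({'a': [1, 2]})
>>> df.to_feather('out.feather')  # doctest: +SKIP
>>> pd.read_feather('out.feather')  # doctest: +SKIP
   a
0  1
1  2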
  1702. """
  1703. from pandas.io.feather_format import to_feather
  1704. to_feather(self, path)
  1705. @Appender(
  1706. """
  1707. Examples
  1708. --------
  1709. >>> df = pd.DataFrame(
  1710. ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
  1711. ... )
  1712. >>> print(df.to_markdown())
  1713. | | animal_1 | animal_2 |
  1714. |---:|:-----------|:-----------|
  1715. | 0 | elk | dog |
  1716. | 1 | pig | quetzal |
  1717. """
  1718. )
  1719. @Substitution(klass="DataFrame")
  1720. @Appender(_shared_docs["to_markdown"])
  1721. def to_markdown(
  1722. self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs
  1723. ) -> Optional[str]:
  1724. kwargs.setdefault("headers", "keys")
  1725. kwargs.setdefault("tablefmt", "pipe")
  1726. tabulate = import_optional_dependency("tabulate")
  1727. result = tabulate.tabulate(self, **kwargs)
  1728. if buf is None:
  1729. return result
  1730. buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode)
  1731. assert buf is not None # Help mypy.
buf.write(result)
  1733. return None
  1734. @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
  1735. def to_parquet(
  1736. self,
  1737. path,
  1738. engine="auto",
  1739. compression="snappy",
  1740. index=None,
  1741. partition_cols=None,
  1742. **kwargs,
  1743. ) -> None:
  1744. """
  1745. Write a DataFrame to the binary parquet format.
  1746. .. versionadded:: 0.21.0
  1747. This function writes the dataframe as a `parquet file
  1748. <https://parquet.apache.org/>`_. You can choose different parquet
  1749. backends, and have the option of compression. See
  1750. :ref:`the user guide <io.parquet>` for more details.
  1751. Parameters
  1752. ----------
  1753. path : str
  1754. File path or Root Directory path. Will be used as Root Directory
  1755. path while writing a partitioned dataset.
  1756. .. versionchanged:: 1.0.0
  1757. Previously this was "fname"
  1758. engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
  1759. Parquet library to use. If 'auto', then the option
  1760. ``io.parquet.engine`` is used. The default ``io.parquet.engine``
  1761. behavior is to try 'pyarrow', falling back to 'fastparquet' if
  1762. 'pyarrow' is unavailable.
  1763. compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
  1764. Name of the compression to use. Use ``None`` for no compression.
  1765. index : bool, default None
  1766. If ``True``, include the dataframe's index(es) in the file output.
  1767. If ``False``, they will not be written to the file.
  1768. If ``None``, similar to ``True`` the dataframe's index(es)
  1769. will be saved. However, instead of being saved as values,
  1770. the RangeIndex will be stored as a range in the metadata so it
  1771. doesn't require much space and is faster. Other indexes will
  1772. be included as columns in the file output.
  1773. .. versionadded:: 0.24.0
  1774. partition_cols : list, optional, default None
  1775. Column names by which to partition the dataset.
  1776. Columns are partitioned in the order they are given.
  1777. .. versionadded:: 0.24.0
  1778. **kwargs
  1779. Additional arguments passed to the parquet library. See
  1780. :ref:`pandas io <io.parquet>` for more details.
  1781. See Also
  1782. --------
  1783. read_parquet : Read a parquet file.
  1784. DataFrame.to_csv : Write a csv file.
  1785. DataFrame.to_sql : Write to a sql table.
  1786. DataFrame.to_hdf : Write to hdf.
  1787. Notes
  1788. -----
  1789. This function requires either the `fastparquet
  1790. <https://pypi.org/project/fastparquet>`_ or `pyarrow
  1791. <https://arrow.apache.org/docs/python/>`_ library.
  1792. Examples
  1793. --------
  1794. >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
  1795. >>> df.to_parquet('df.parquet.gzip',
  1796. ... compression='gzip') # doctest: +SKIP
  1797. >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
  1798. col1 col2
  1799. 0 1 3
  1800. 1 2 4
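A sketch of writing a partitioned dataset, as described by
``partition_cols`` above; the directory name is a placeholder:

>>> df.to_parquet('df_partitioned',
...               partition_cols=['col1'])  # doctest: +SKIP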
  1801. """
  1802. from pandas.io.parquet import to_parquet
  1803. to_parquet(
  1804. self,
  1805. path,
  1806. engine,
  1807. compression=compression,
  1808. index=index,
  1809. partition_cols=partition_cols,
  1810. **kwargs,
  1811. )
  1812. @Substitution(
  1813. header_type="bool",
  1814. header="Whether to print column labels, default True",
  1815. col_space_type="str or int",
  1816. col_space="The minimum width of each column in CSS length "
  1817. "units. An int is assumed to be px units.\n\n"
  1818. " .. versionadded:: 0.25.0\n"
  1819. " Ability to use str",
  1820. )
  1821. @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
  1822. def to_html(
  1823. self,
  1824. buf=None,
  1825. columns=None,
  1826. col_space=None,
  1827. header=True,
  1828. index=True,
  1829. na_rep="NaN",
  1830. formatters=None,
  1831. float_format=None,
  1832. sparsify=None,
  1833. index_names=True,
  1834. justify=None,
  1835. max_rows=None,
  1836. max_cols=None,
  1837. show_dimensions=False,
  1838. decimal=".",
  1839. bold_rows=True,
  1840. classes=None,
  1841. escape=True,
  1842. notebook=False,
  1843. border=None,
  1844. table_id=None,
  1845. render_links=False,
  1846. encoding=None,
  1847. ):
  1848. """
  1849. Render a DataFrame as an HTML table.
  1850. %(shared_params)s
  1851. bold_rows : bool, default True
  1852. Make the row labels bold in the output.
  1853. classes : str or list or tuple, default None
  1854. CSS class(es) to apply to the resulting html table.
  1855. escape : bool, default True
  1856. Convert the characters <, >, and & to HTML-safe sequences.
  1857. notebook : {True, False}, default False
  1858. Whether the generated HTML is for IPython Notebook.
  1859. border : int
  1860. A ``border=border`` attribute is included in the opening
  1861. `<table>` tag. Default ``pd.options.display.html.border``.
  1862. encoding : str, default "utf-8"
  1863. Set character encoding.
  1864. .. versionadded:: 1.0
  1865. table_id : str, optional
  1866. A css id is included in the opening `<table>` tag if specified.
  1867. .. versionadded:: 0.23.0
  1868. render_links : bool, default False
  1869. Convert URLs to HTML links.
  1870. .. versionadded:: 0.24.0
  1871. %(returns)s
  1872. See Also
  1873. --------
  1874. to_string : Convert DataFrame to a string.
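Examples
--------
An illustrative sketch; the full markup is elided and only the opening
tag is checked:

>>> df = pd.DataFrame({'col1': [1, 2]})
>>> html = df.to_html()
>>> html.startswith('<table')
True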
  1875. """
  1876. if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
  1877. raise ValueError("Invalid value for justify parameter")
  1878. formatter = fmt.DataFrameFormatter(
  1879. self,
  1880. columns=columns,
  1881. col_space=col_space,
  1882. na_rep=na_rep,
  1883. formatters=formatters,
  1884. float_format=float_format,
  1885. sparsify=sparsify,
  1886. justify=justify,
  1887. index_names=index_names,
  1888. header=header,
  1889. index=index,
  1890. bold_rows=bold_rows,
  1891. escape=escape,
  1892. max_rows=max_rows,
  1893. max_cols=max_cols,
  1894. show_dimensions=show_dimensions,
  1895. decimal=decimal,
  1896. table_id=table_id,
  1897. render_links=render_links,
  1898. )
# TODO: a generic formatter would belong in DataFrameFormatter
  1900. return formatter.to_html(
  1901. buf=buf,
  1902. classes=classes,
  1903. notebook=notebook,
  1904. border=border,
  1905. encoding=encoding,
  1906. )
  1907. # ----------------------------------------------------------------------
  1908. def info(
  1909. self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None
  1910. ) -> None:
  1911. """
  1912. Print a concise summary of a DataFrame.
  1913. This method prints information about a DataFrame including
  1914. the index dtype and column dtypes, non-null values and memory usage.
  1915. Parameters
  1916. ----------
  1917. verbose : bool, optional
  1918. Whether to print the full summary. By default, the setting in
  1919. ``pandas.options.display.max_info_columns`` is followed.
  1920. buf : writable buffer, defaults to sys.stdout
  1921. Where to send the output. By default, the output is printed to
  1922. sys.stdout. Pass a writable buffer if you need to further process
  1923. the output.
  1924. max_cols : int, optional
  1925. When to switch from the verbose to the truncated output. If the
  1926. DataFrame has more than `max_cols` columns, the truncated output
  1927. is used. By default, the setting in
  1928. ``pandas.options.display.max_info_columns`` is used.
  1929. memory_usage : bool, str, optional
  1930. Specifies whether total memory usage of the DataFrame
  1931. elements (including the index) should be displayed. By default,
  1932. this follows the ``pandas.options.display.memory_usage`` setting.
True always shows memory usage. False never shows memory usage.
A value of 'deep' is equivalent to "True with deep introspection".
Memory usage is shown in human-readable units (base-2
representation). Without deep introspection, a memory estimation is
made based on column dtype and number of rows, assuming values
consume the same amount of memory for corresponding dtypes. With deep
  1939. memory introspection, a real memory usage calculation is performed
  1940. at the cost of computational resources.
  1941. null_counts : bool, optional
  1942. Whether to show the non-null counts. By default, this is shown
  1943. only if the frame is smaller than
  1944. ``pandas.options.display.max_info_rows`` and
  1945. ``pandas.options.display.max_info_columns``. A value of True always
  1946. shows the counts, and False never shows the counts.
  1947. Returns
  1948. -------
  1949. None
  1950. This method prints a summary of a DataFrame and returns None.
  1951. See Also
  1952. --------
  1953. DataFrame.describe: Generate descriptive statistics of DataFrame
  1954. columns.
  1955. DataFrame.memory_usage: Memory usage of DataFrame columns.
  1956. Examples
  1957. --------
  1958. >>> int_values = [1, 2, 3, 4, 5]
  1959. >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
  1960. >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
  1961. >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
  1962. ... "float_col": float_values})
  1963. >>> df
  1964. int_col text_col float_col
  1965. 0 1 alpha 0.00
  1966. 1 2 beta 0.25
  1967. 2 3 gamma 0.50
  1968. 3 4 delta 0.75
  1969. 4 5 epsilon 1.00
  1970. Prints information of all columns:
  1971. >>> df.info(verbose=True)
  1972. <class 'pandas.core.frame.DataFrame'>
  1973. RangeIndex: 5 entries, 0 to 4
  1974. Data columns (total 3 columns):
  1975. # Column Non-Null Count Dtype
  1976. --- ------ -------------- -----
  1977. 0 int_col 5 non-null int64
  1978. 1 text_col 5 non-null object
  1979. 2 float_col 5 non-null float64
  1980. dtypes: float64(1), int64(1), object(1)
  1981. memory usage: 248.0+ bytes
Prints a summary of the column count and dtypes but not per-column
information:
  1984. >>> df.info(verbose=False)
  1985. <class 'pandas.core.frame.DataFrame'>
  1986. RangeIndex: 5 entries, 0 to 4
  1987. Columns: 3 entries, int_col to float_col
  1988. dtypes: float64(1), int64(1), object(1)
  1989. memory usage: 248.0+ bytes
Pipe the output of DataFrame.info to a buffer instead of sys.stdout,
get the buffer content and write it to a text file:
  1992. >>> import io
  1993. >>> buffer = io.StringIO()
  1994. >>> df.info(buf=buffer)
  1995. >>> s = buffer.getvalue()
  1996. >>> with open("df_info.txt", "w",
  1997. ... encoding="utf-8") as f: # doctest: +SKIP
  1998. ... f.write(s)
  1999. 260
The `memory_usage` parameter allows deep introspection, which is
especially useful for big DataFrames and fine-tuning memory
optimization:
  2002. >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
  2003. >>> df = pd.DataFrame({
  2004. ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
  2005. ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
  2006. ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
  2007. ... })
  2008. >>> df.info()
  2009. <class 'pandas.core.frame.DataFrame'>
  2010. RangeIndex: 1000000 entries, 0 to 999999
  2011. Data columns (total 3 columns):
  2012. # Column Non-Null Count Dtype
  2013. --- ------ -------------- -----
  2014. 0 column_1 1000000 non-null object
  2015. 1 column_2 1000000 non-null object
  2016. 2 column_3 1000000 non-null object
  2017. dtypes: object(3)
  2018. memory usage: 22.9+ MB
  2019. >>> df.info(memory_usage='deep')
  2020. <class 'pandas.core.frame.DataFrame'>
  2021. RangeIndex: 1000000 entries, 0 to 999999
  2022. Data columns (total 3 columns):
  2023. # Column Non-Null Count Dtype
  2024. --- ------ -------------- -----
  2025. 0 column_1 1000000 non-null object
  2026. 1 column_2 1000000 non-null object
  2027. 2 column_3 1000000 non-null object
  2028. dtypes: object(3)
  2029. memory usage: 188.8 MB
  2030. """
        if buf is None:  # pragma: no cover
            buf = sys.stdout

        lines = []

        lines.append(str(type(self)))
        lines.append(self.index._summary())

        if len(self.columns) == 0:
            lines.append(f"Empty {type(self).__name__}")
            fmt.buffer_put_lines(buf, lines)
            return

        cols = self.columns
        col_count = len(self.columns)

        # hack
        if max_cols is None:
            max_cols = get_option("display.max_info_columns", len(self.columns) + 1)

        max_rows = get_option("display.max_info_rows", len(self) + 1)

        if null_counts is None:
            show_counts = (col_count <= max_cols) and (len(self) < max_rows)
        else:
            show_counts = null_counts
        exceeds_info_cols = col_count > max_cols

        def _verbose_repr():
            lines.append(f"Data columns (total {len(self.columns)} columns):")

            id_head = " # "
            column_head = "Column"
            col_space = 2

            max_col = max(len(pprint_thing(k)) for k in cols)
            len_column = len(pprint_thing(column_head))
            space = max(max_col, len_column) + col_space

            max_id = len(pprint_thing(col_count))
            len_id = len(pprint_thing(id_head))
            space_num = max(max_id, len_id) + col_space
            counts = None

            header = _put_str(id_head, space_num) + _put_str(column_head, space)
            if show_counts:
                counts = self.count()
                if len(cols) != len(counts):  # pragma: no cover
                    raise AssertionError(
                        f"Columns must equal counts ({len(cols)} != {len(counts)})"
                    )
                count_header = "Non-Null Count"
                len_count = len(count_header)
                non_null = " non-null"
                max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null)
                space_count = max(len_count, max_count) + col_space
                count_temp = "{count}" + non_null
            else:
                count_header = ""
                space_count = len(count_header)
                len_count = space_count
                count_temp = "{count}"

            dtype_header = "Dtype"
            len_dtype = len(dtype_header)
            max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes)
            space_dtype = max(len_dtype, max_dtypes)

            header += _put_str(count_header, space_count) + _put_str(
                dtype_header, space_dtype
            )
            lines.append(header)
            lines.append(
                _put_str("-" * len_id, space_num)
                + _put_str("-" * len_column, space)
                + _put_str("-" * len_count, space_count)
                + _put_str("-" * len_dtype, space_dtype)
            )

            for i, col in enumerate(self.columns):
                dtype = self.dtypes.iloc[i]
                col = pprint_thing(col)

                line_no = _put_str(f" {i}", space_num)
                count = ""
                if show_counts:
                    count = counts.iloc[i]

                lines.append(
                    line_no
                    + _put_str(col, space)
                    + _put_str(count_temp.format(count=count), space_count)
                    + _put_str(dtype, space_dtype)
                )

        def _non_verbose_repr():
            lines.append(self.columns._summary(name="Columns"))

        def _sizeof_fmt(num, size_qualifier):
            # returns size in human readable format
            for x in ["bytes", "KB", "MB", "GB", "TB"]:
                if num < 1024.0:
                    return f"{num:3.1f}{size_qualifier} {x}"
                num /= 1024.0
            return f"{num:3.1f}{size_qualifier} PB"

        if verbose:
            _verbose_repr()
        elif verbose is False:  # specifically set to False, not necessarily None
            _non_verbose_repr()
        else:
            if exceeds_info_cols:
                _non_verbose_repr()
            else:
                _verbose_repr()

        counts = self._data.get_dtype_counts()
        dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())]
        lines.append(f"dtypes: {', '.join(dtypes)}")

        if memory_usage is None:
            memory_usage = get_option("display.memory_usage")
        if memory_usage:
            # append memory usage of df to display
            size_qualifier = ""
            if memory_usage == "deep":
                deep = True
            else:
                # size_qualifier is just a best effort; not guaranteed to catch
                # all cases (e.g., it misses categorical data even with object
                # categories)
                deep = False
                if "object" in counts or self.index._is_memory_usage_qualified():
                    size_qualifier = "+"
            mem_usage = self.memory_usage(index=True, deep=deep).sum()
            lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n")
        fmt.buffer_put_lines(buf, lines)
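
    # Editorial sketch (not part of the original source): the ``_sizeof_fmt``
    # helper above divides by 1024 until the value fits the unit, e.g.
    #
    #     1_572_864 bytes                  -> "1.5 MB"   (1_572_864 / 1024**2)
    #     2_500 bytes with qualifier "+"   -> "2.4+ KB"
    #
    # The "+" qualifier marks a lower bound: shallow introspection cannot see
    # the payload of ``object`` columns.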

    def memory_usage(self, index=True, deep=False) -> Series:
        """
        Return the memory usage of each column in bytes.

        The memory usage can optionally include the contribution of
        the index and elements of `object` dtype.

        This value is displayed in `DataFrame.info` by default. This can be
        suppressed by setting ``pandas.options.display.memory_usage`` to False.

        Parameters
        ----------
        index : bool, default True
            Specifies whether to include the memory usage of the DataFrame's
            index in returned Series. If ``index=True``, the memory usage of
            the index is the first item in the output.
        deep : bool, default False
            If True, introspect the data deeply by interrogating
            `object` dtypes for system-level memory consumption, and include
            it in the returned values.

        Returns
        -------
        Series
            A Series whose index is the original column names and whose values
            are the memory usage of each column in bytes.

        See Also
        --------
        numpy.ndarray.nbytes : Total bytes consumed by the elements of an
            ndarray.
        Series.memory_usage : Bytes consumed by a Series.
        Categorical : Memory-efficient array for string values with
            many repeated values.
        DataFrame.info : Concise summary of a DataFrame.

        Examples
        --------
        >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
        >>> data = dict([(t, np.ones(shape=5000).astype(t))
        ...              for t in dtypes])
        >>> df = pd.DataFrame(data)
        >>> df.head()
           int64  float64            complex128 object  bool
        0      1      1.0    1.000000+0.000000j      1  True
        1      1      1.0    1.000000+0.000000j      1  True
        2      1      1.0    1.000000+0.000000j      1  True
        3      1      1.0    1.000000+0.000000j      1  True
        4      1      1.0    1.000000+0.000000j      1  True

        >>> df.memory_usage()
        Index           128
        int64         40000
        float64       40000
        complex128    80000
        object        40000
        bool           5000
        dtype: int64

        >>> df.memory_usage(index=False)
        int64         40000
        float64       40000
        complex128    80000
        object        40000
        bool           5000
        dtype: int64

        The memory footprint of `object` dtype columns is ignored by default:

        >>> df.memory_usage(deep=True)
        Index            128
        int64          40000
        float64        40000
        complex128     80000
        object        160000
        bool            5000
        dtype: int64

        Use a Categorical for efficient storage of an object-dtype column with
        many repeated values.

        >>> df['object'].astype('category').memory_usage(deep=True)
        5216
        """
        result = Series(
            [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
            index=self.columns,
        )
        if index:
            result = Series(self.index.memory_usage(deep=deep), index=["Index"]).append(
                result
            )
        return result
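
    # Editorial sketch (not part of the original source): comparing the
    # shallow and deep totals makes the object-dtype underestimate visible.
    #
    #     shallow = df.memory_usage().sum()        # pointer-level bytes only
    #     deep = df.memory_usage(deep=True).sum()  # includes string payloads
    #     assert deep >= shallow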

    def transpose(self, *args, copy: bool = False) -> "DataFrame":
        """
        Transpose index and columns.

        Reflect the DataFrame over its main diagonal by writing rows as columns
        and vice-versa. The property :attr:`.T` is an accessor to the method
        :meth:`transpose`.

        Parameters
        ----------
        *args : tuple, optional
            Accepted for compatibility with NumPy.
        copy : bool, default False
            Whether to copy the data after transposing, even for DataFrames
            with a single dtype.

            Note that a copy is always required for mixed dtype DataFrames,
            or for DataFrames with any extension types.

        Returns
        -------
        DataFrame
            The transposed DataFrame.

        See Also
        --------
        numpy.transpose : Permute the dimensions of a given array.

        Notes
        -----
        Transposing a DataFrame with mixed dtypes will result in a homogeneous
        DataFrame with the `object` dtype. In such a case, a copy of the data
        is always made.

        Examples
        --------
        **Square DataFrame with homogeneous dtype**

        >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
        >>> df1 = pd.DataFrame(data=d1)
        >>> df1
           col1  col2
        0     1     3
        1     2     4

        >>> df1_transposed = df1.T  # or df1.transpose()
        >>> df1_transposed
              0  1
        col1  1  2
        col2  3  4

        When the dtype is homogeneous in the original DataFrame, we get a
        transposed DataFrame with the same dtype:

        >>> df1.dtypes
        col1    int64
        col2    int64
        dtype: object
        >>> df1_transposed.dtypes
        0    int64
        1    int64
        dtype: object

        **Non-square DataFrame with mixed dtypes**

        >>> d2 = {'name': ['Alice', 'Bob'],
        ...       'score': [9.5, 8],
        ...       'employed': [False, True],
        ...       'kids': [0, 0]}
        >>> df2 = pd.DataFrame(data=d2)
        >>> df2
            name  score  employed  kids
        0  Alice    9.5     False     0
        1    Bob    8.0      True     0

        >>> df2_transposed = df2.T  # or df2.transpose()
        >>> df2_transposed
                      0     1
        name      Alice   Bob
        score       9.5     8
        employed  False  True
        kids          0     0

        When the DataFrame has mixed dtypes, we get a transposed DataFrame with
        the `object` dtype:

        >>> df2.dtypes
        name         object
        score       float64
        employed       bool
        kids          int64
        dtype: object
        >>> df2_transposed.dtypes
        0    object
        1    object
        dtype: object
        """
        nv.validate_transpose(args, dict())
        # construct the args

        dtypes = list(self.dtypes)
        if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]):
            # We have EAs with the same dtype. We can preserve that dtype in transpose.
            dtype = dtypes[0]
            arr_type = dtype.construct_array_type()
            values = self.values

            new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
            result = self._constructor(
                dict(zip(self.index, new_values)), index=self.columns
            )
        else:
            new_values = self.values.T
            if copy:
                new_values = new_values.copy()
            result = self._constructor(
                new_values, index=self.columns, columns=self.index
            )

        return result.__finalize__(self)

    T = property(transpose)
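
    # Editorial sketch (not part of the original source): because a
    # mixed-dtype transpose goes through a single object-dtype ndarray, a
    # double transpose does not restore the original dtypes.
    #
    #     df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    #     df.T.T.dtypes      # both columns come back as object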

    # ----------------------------------------------------------------------
    # Indexing Methods

    def _ixs(self, i: int, axis: int = 0):
        """
        Parameters
        ----------
        i : int
        axis : int

        Notes
        -----
        If a slice is passed, the resulting data will be a view.
        """
        # irow
        if axis == 0:
            new_values = self._data.fast_xs(i)

            # if we are a copy, mark as such
            copy = isinstance(new_values, np.ndarray) and new_values.base is None
            result = self._constructor_sliced(
                new_values,
                index=self.columns,
                name=self.index[i],
                dtype=new_values.dtype,
            )
            result._set_is_copy(self, copy=copy)
            return result

        # icol
        else:
            label = self.columns[i]

            # if the values returned are not the same length
            # as the index (in other words, a not-found value), iget returns
            # a 0-len ndarray. This is effectively catching
            # a numpy error (as numpy should really raise)
            values = self._data.iget(i)

            if len(self.index) and not len(values):
                values = np.array([np.nan] * len(self.index), dtype=object)
            result = self._box_col_values(values, label)

            # this is a cached value, mark it so
            result._set_as_cached(label, self)

            return result

    def __getitem__(self, key):
        key = lib.item_from_zerodim(key)
        key = com.apply_if_callable(key, self)

        if is_hashable(key):
            # shortcut if the key is in columns
            if self.columns.is_unique and key in self.columns:
                if self.columns.nlevels > 1:
                    return self._getitem_multilevel(key)
                return self._get_item_cache(key)

        # Do we have a slicer (on rows)?
        indexer = convert_to_index_sliceable(self, key)
        if indexer is not None:
            # either we have a slice or we have a string that can be converted
            # to a slice for partial-string date indexing
            return self._slice(indexer, axis=0)

        # Do we have a (boolean) DataFrame?
        if isinstance(key, DataFrame):
            return self.where(key)

        # Do we have a (boolean) 1d indexer?
        if com.is_bool_indexer(key):
            return self._getitem_bool_array(key)

        # We are left with two options: a single key or a collection of keys.
        # We interpret tuples as collections only for non-MultiIndex
        is_single_key = isinstance(key, tuple) or not is_list_like(key)

        if is_single_key:
            if self.columns.nlevels > 1:
                return self._getitem_multilevel(key)

            indexer = self.columns.get_loc(key)
            if is_integer(indexer):
                indexer = [indexer]
        else:
            if is_iterator(key):
                key = list(key)
            indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]

        # take() does not accept boolean indexers
        if getattr(indexer, "dtype", None) == bool:
            indexer = np.where(indexer)[0]

        data = self._take_with_is_copy(indexer, axis=1)

        if is_single_key:
            # What does looking for a single key in a non-unique index return?
            # The behavior is inconsistent. It returns a Series, except when
            # - the key itself is repeated (test on data.shape, #9519), or
            # - we have a MultiIndex on columns (test on self.columns, #21309)
            if data.shape[1] == 1 and not isinstance(self.columns, ABCMultiIndex):
                data = data[key]

        return data
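
    # Editorial sketch (not part of the original source): the key kinds
    # dispatched above, in order.
    #
    #     df["a"]            # hashable label      -> column (cached Series)
    #     df[1:3]            # slice               -> row slice (a view)
    #     df[df > 0]         # boolean DataFrame   -> self.where(key)
    #     df[df["a"] > 0]    # boolean 1d indexer  -> row selection
    #     df[["a", "b"]]     # list of labels      -> column subset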

    def _getitem_bool_array(self, key):
        # also raises Exception if object array with NA values
        # warning here just in case -- previously __setitem__ was
        # reindexing but __getitem__ was not; it seems more reasonable to
        # go with the __setitem__ behavior since that is more consistent
        # with all other indexing behavior
        if isinstance(key, Series) and not key.index.equals(self.index):
            warnings.warn(
                "Boolean Series key will be reindexed to match DataFrame index.",
                UserWarning,
                stacklevel=3,
            )
        elif len(key) != len(self.index):
            raise ValueError(
                f"Item wrong length {len(key)} instead of {len(self.index)}."
            )

        # check_bool_indexer will throw exception if Series key cannot
        # be reindexed to match DataFrame rows
        key = check_bool_indexer(self.index, key)
        indexer = key.nonzero()[0]
        return self._take_with_is_copy(indexer, axis=0)

    def _getitem_multilevel(self, key):
        # self.columns is a MultiIndex
        loc = self.columns.get_loc(key)
        if isinstance(loc, (slice, Series, np.ndarray, Index)):
            new_columns = self.columns[loc]
            result_columns = maybe_droplevels(new_columns, key)
            if self._is_mixed_type:
                result = self.reindex(columns=new_columns)
                result.columns = result_columns
            else:
                new_values = self.values[:, loc]
                result = self._constructor(
                    new_values, index=self.index, columns=result_columns
                )
                result = result.__finalize__(self)

            # If there is only one column being returned, and its name is
            # either an empty string, or a tuple with an empty string as its
            # first element, then treat the empty string as a placeholder
            # and return the column as if the user had provided that empty
            # string in the key. If the result is a Series, exclude the
            # implied empty string from its name.
            if len(result.columns) == 1:
                top = result.columns[0]
                if isinstance(top, tuple):
                    top = top[0]
                if top == "":
                    result = result[""]
                    if isinstance(result, Series):
                        result = self._constructor_sliced(
                            result, index=self.index, name=key
                        )

            result._set_is_copy(self)
            return result
        else:
            return self._get_item_cache(key)

    def _get_value(self, index, col, takeable: bool = False):
        """
        Quickly retrieve single value at passed column and index.

        Parameters
        ----------
        index : row label
        col : column label
        takeable : interpret the index/col as indexers, default False

        Returns
        -------
        scalar
        """
        if takeable:
            series = self._iget_item_cache(col)
            return com.maybe_box_datetimelike(series._values[index])

        series = self._get_item_cache(col)
        engine = self.index._engine

        try:
            return engine.get_value(series._values, index)
        except KeyError:
            # GH 20629
            if self.index.nlevels > 1:
                # partial indexing forbidden
                raise
        except (TypeError, ValueError):
            pass

        # we cannot handle direct indexing
        # use positional
        col = self.columns.get_loc(col)
        index = self.index.get_loc(index)
        return self._get_value(index, col, takeable=True)
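
    # Editorial note (not part of the original source): ``_get_value`` is
    # internal; the public fast scalar accessors built on this machinery are
    # ``.at`` (label-based) and ``.iat`` (position-based).
    #
    #     df.at["row_label", "col_label"]
    #     df.iat[0, 0]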

    def __setitem__(self, key, value):
        key = com.apply_if_callable(key, self)

        # see if we can slice the rows
        indexer = convert_to_index_sliceable(self, key)
        if indexer is not None:
            # either we have a slice or we have a string that can be converted
            # to a slice for partial-string date indexing
            return self._setitem_slice(indexer, value)

        if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
            self._setitem_frame(key, value)
        elif isinstance(key, (Series, np.ndarray, list, Index)):
            self._setitem_array(key, value)
        else:
            # set column
            self._set_item(key, value)

    def _setitem_slice(self, key, value):
        # NB: we can't just use self.loc[key] = value because that
        # operates on labels and we need to operate positionally for
        # backwards-compat, xref GH#31469
        self._check_setitem_copy()
        self.loc._setitem_with_indexer(key, value)

    def _setitem_array(self, key, value):
        # also raises Exception if object array with NA values
        if com.is_bool_indexer(key):
            if len(key) != len(self.index):
                raise ValueError(
                    f"Item wrong length {len(key)} instead of {len(self.index)}!"
                )
            key = check_bool_indexer(self.index, key)
            indexer = key.nonzero()[0]
            self._check_setitem_copy()
            self.loc._setitem_with_indexer(indexer, value)
        else:
            if isinstance(value, DataFrame):
                if len(value.columns) != len(key):
                    raise ValueError("Columns must be same length as key")
                for k1, k2 in zip(key, value.columns):
                    self[k1] = value[k2]
            else:
                indexer = self.loc._get_listlike_indexer(
                    key, axis=1, raise_missing=False
                )[1]
                self._check_setitem_copy()
                self.loc._setitem_with_indexer((slice(None), indexer), value)

    def _setitem_frame(self, key, value):
        # support boolean setting with DataFrame input, e.g.
        # df[df > df2] = 0
        if isinstance(key, np.ndarray):
            if key.shape != self.shape:
                raise ValueError("Array conditional must be same shape as self")
            key = self._constructor(key, **self._construct_axes_dict())

        if key.values.size and not is_bool_dtype(key.values):
            raise TypeError(
                "Must pass DataFrame or 2-d ndarray with boolean values only"
            )

        self._check_inplace_setting(value)
        self._check_setitem_copy()
        self._where(-key, value, inplace=True)
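
    # Editorial sketch (not part of the original source): the boolean-frame
    # branch above is what makes mask-style assignment work.
    #
    #     df = pd.DataFrame({"a": [1, -2], "b": [-3, 4]})
    #     df[df < 0] = 0      # routed through _setitem_frame / _where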

    def _set_item(self, key, value):
        """
        Add series to DataFrame in specified column.

        If series is a numpy-array (not a Series/TimeSeries), it must be the
        same length as the DataFrame's index or an error will be thrown.

        Series/TimeSeries will be conformed to the DataFrame's index to
        ensure homogeneity.
        """
        self._ensure_valid_index(value)
        value = self._sanitize_column(key, value)
        NDFrame._set_item(self, key, value)

        # check if we are modifying a copy
        # try to set first as we want an invalid
        # value exception to occur first
        if len(self):
            self._check_setitem_copy()

    def _set_value(self, index, col, value, takeable: bool = False):
        """
        Put single value at passed column and index.

        Parameters
        ----------
        index : row label
        col : column label
        value : scalar
        takeable : interpret the index/col as indexers, default False

        Returns
        -------
        DataFrame
            If the label pair is contained, a reference to the calling
            DataFrame; otherwise a new object.
        """
        try:
            if takeable is True:
                series = self._iget_item_cache(col)
                return series._set_value(index, value, takeable=True)

            series = self._get_item_cache(col)
            engine = self.index._engine
            engine.set_value(series._values, index, value)
            return self
        except (KeyError, TypeError):
            # set using a non-recursive method & reset the cache
            if takeable:
                self.iloc[index, col] = value
            else:
                self.loc[index, col] = value
            self._item_cache.pop(col, None)
            return self

    def _ensure_valid_index(self, value):
        """
        Ensure that if we don't have an index, we can create one from the
        passed value.
        """
        # GH5632, make sure that we are a Series convertible
        if not len(self.index) and is_list_like(value) and len(value):
            try:
                value = Series(value)
            except (ValueError, NotImplementedError, TypeError):
                raise ValueError(
                    "Cannot set a frame with no defined index "
                    "and a value that cannot be converted to a "
                    "Series"
                )

            self._data = self._data.reindex_axis(
                value.index.copy(), axis=1, fill_value=np.nan
            )

    def _box_item_values(self, key, values):
        items = self.columns[self.columns.get_loc(key)]
        if values.ndim == 2:
            return self._constructor(values.T, columns=items, index=self.index)
        else:
            return self._box_col_values(values, items)

    def _box_col_values(self, values, items):
        """
        Provide boxed values for a column.
        """
        klass = self._constructor_sliced
        return klass(values, index=self.index, name=items, fastpath=True)

    # ----------------------------------------------------------------------
    # Unsorted

    def query(self, expr, inplace=False, **kwargs):
        """
        Query the columns of a DataFrame with a boolean expression.

        Parameters
        ----------
        expr : str
            The query string to evaluate.

            You can refer to variables
            in the environment by prefixing them with an '@' character like
            ``@a + b``.

            You can refer to column names that contain spaces or operators by
            surrounding them in backticks. This way you can also escape
            names that start with a digit, or those that are a Python keyword.
            In short, use backticks whenever a column name is not a valid
            Python identifier. See the Notes section below for more details.

            For example, if one of your columns is called ``a a`` and you want
            to sum it with ``b``, your query should be ```a a` + b``.

            .. versionadded:: 0.25.0
                Backtick quoting introduced.

            .. versionadded:: 1.0.0
                Backtick quoting extended to cover characters other than
                spaces.

        inplace : bool
            Whether the query should modify the data in place or return
            a modified copy.
        **kwargs
            See the documentation for :func:`eval` for complete details
            on the keyword arguments accepted by :meth:`DataFrame.query`.

        Returns
        -------
        DataFrame
            DataFrame resulting from the provided query expression.

        See Also
        --------
        eval : Evaluate a string describing operations on
            DataFrame columns.
        DataFrame.eval : Evaluate a string describing operations on
            DataFrame columns.

        Notes
        -----
        The result of the evaluation of this expression is first passed to
        :attr:`DataFrame.loc` and if that fails because of a
        multidimensional key (e.g., a DataFrame) then the result will be passed
        to :meth:`DataFrame.__getitem__`.

        This method uses the top-level :func:`eval` function to
        evaluate the passed query.

        The :meth:`~pandas.DataFrame.query` method uses a slightly
        modified Python syntax by default. For example, the ``&`` and ``|``
        (bitwise) operators have the precedence of their boolean cousins,
        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
        however the semantics are different.

        You can change the semantics of the expression by passing the keyword
        argument ``parser='python'``. This enforces the same semantics as
        evaluation in Python space. Likewise, you can pass ``engine='python'``
        to evaluate an expression using Python itself as a backend. This is not
        recommended as it is inefficient compared to using ``numexpr`` as the
        engine.

        The :attr:`DataFrame.index` and
        :attr:`DataFrame.columns` attributes of the
        :class:`~pandas.DataFrame` instance are placed in the query namespace
        by default, which allows you to treat both the index and columns of the
        frame as a column in the frame.
        The identifier ``index`` is used for the frame index; you can also
        use the name of the index to identify it in a query. Please note that
        Python keywords may not be used as identifiers.

        For further details and examples see the ``query`` documentation in
        :ref:`indexing <indexing.query>`.

        *Backtick quoted variables*

        Backtick quoted variables are parsed as literal Python code and
        are converted internally to a valid Python identifier.
        This can lead to the following problems.

        During parsing a number of disallowed characters inside the backtick
        quoted string are replaced by strings that are allowed as a Python
        identifier. These characters include all operators in Python, the
        space character, the question mark, the exclamation mark, the dollar
        sign, and the euro sign.
        For other characters that fall outside the ASCII range (U+0001..U+007F)
        and those that are not further specified in PEP 3131,
        the query parser will raise an error.
        Whitespace other than the space character is also disallowed, as are
        the hash character (as it is used for comments) and the backtick
        itself (the backtick cannot be escaped).

        In a special case, quotes that make a pair around a backtick can
        confuse the parser.
        For example, ```it's` > `that's``` will raise an error,
        as it forms a quoted string (``'s > `that'``) with a backtick inside.

        See also the Python documentation about lexical analysis
        (https://docs.python.org/3/reference/lexical_analysis.html)
        in combination with the source code in :mod:`pandas.core.computation.parsing`.

        Examples
        --------
        >>> df = pd.DataFrame({'A': range(1, 6),
        ...                    'B': range(10, 0, -2),
        ...                    'C C': range(10, 5, -1)})
        >>> df
           A   B  C C
        0  1  10   10
        1  2   8    9
        2  3   6    8
        3  4   4    7
        4  5   2    6
        >>> df.query('A > B')
           A  B  C C
        4  5  2    6

        The previous expression is equivalent to

        >>> df[df.A > df.B]
           A  B  C C
        4  5  2    6

        For columns with spaces in their name, you can use backtick quoting.

        >>> df.query('B == `C C`')
           A   B  C C
        0  1  10   10

        The previous expression is equivalent to

        >>> df[df.B == df['C C']]
           A   B  C C
        0  1  10   10
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        if not isinstance(expr, str):
            msg = f"expr must be a string to be evaluated, {type(expr)} given"
            raise ValueError(msg)
        kwargs["level"] = kwargs.pop("level", 0) + 1
        kwargs["target"] = None
        res = self.eval(expr, **kwargs)

        try:
            new_data = self.loc[res]
        except ValueError:
            # when res is multi-dimensional loc raises, but this is sometimes a
            # valid query
            new_data = self[res]

        if inplace:
            self._update_inplace(new_data)
        else:
            return new_data
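
    # Editorial sketch (not part of the original source): environment
    # variables are referenced with '@' inside the expression string.
    #
    #     threshold = 3
    #     df.query('A > @threshold')    # same rows as df[df.A > threshold]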

    def eval(self, expr, inplace=False, **kwargs):
        """
        Evaluate a string describing operations on DataFrame columns.

        Operates on columns only, not specific rows or elements. This allows
        `eval` to run arbitrary code, which can make you vulnerable to code
        injection if you pass user input to this function.

        Parameters
        ----------
        expr : str
            The expression string to evaluate.
        inplace : bool, default False
            If the expression contains an assignment, whether to perform the
            operation inplace and mutate the existing DataFrame. Otherwise,
            a new DataFrame is returned.
        **kwargs
            See the documentation for :func:`eval` for complete details
            on the keyword arguments accepted by
            :meth:`~pandas.DataFrame.query`.

        Returns
        -------
        ndarray, scalar, or pandas object
            The result of the evaluation.

        See Also
        --------
        DataFrame.query : Evaluates a boolean expression to query the columns
            of a frame.
        DataFrame.assign : Can evaluate an expression or function to create new
            values for a column.
        eval : Evaluate a Python expression as a string using various
            backends.

        Notes
        -----
        For more details see the API documentation for :func:`~eval`.
        For detailed examples see :ref:`enhancing performance with eval
        <enhancingperf.eval>`.

        Examples
        --------
        >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
        >>> df
           A   B
        0  1  10
        1  2   8
        2  3   6
        3  4   4
        4  5   2
        >>> df.eval('A + B')
        0    11
        1    10
        2     9
        3     8
        4     7
        dtype: int64

        Assignment is allowed, though by default the original DataFrame is not
        modified.

        >>> df.eval('C = A + B')
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7
        >>> df
           A   B
        0  1  10
        1  2   8
        2  3   6
        3  4   4
        4  5   2

        Use ``inplace=True`` to modify the original DataFrame.

        >>> df.eval('C = A + B', inplace=True)
        >>> df
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7
        """
        from pandas.core.computation.eval import eval as _eval

        inplace = validate_bool_kwarg(inplace, "inplace")
        resolvers = kwargs.pop("resolvers", None)
        kwargs["level"] = kwargs.pop("level", 0) + 1
        if resolvers is None:
            index_resolvers = self._get_index_resolvers()
            column_resolvers = self._get_cleaned_column_resolvers()
            resolvers = column_resolvers, index_resolvers
        if "target" not in kwargs:
            kwargs["target"] = self
        kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)

        return _eval(expr, inplace=inplace, **kwargs)
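
    # Editorial sketch (not part of the original source): ``eval`` also
    # accepts several assignments in one multi-line expression, where later
    # lines may refer to columns created by earlier ones.
    #
    #     df.eval(
    #         '''
    #         C = A + B
    #         D = C * 2
    #         '''
    #     )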

    def select_dtypes(self, include=None, exclude=None) -> "DataFrame":
        """
        Return a subset of the DataFrame's columns based on the column dtypes.

        Parameters
        ----------
        include, exclude : scalar or list-like
            A selection of dtypes or strings to be included/excluded. At least
            one of these parameters must be supplied.

        Returns
        -------
        DataFrame
            The subset of the frame including the dtypes in ``include`` and
            excluding the dtypes in ``exclude``.

        Raises
        ------
        ValueError
            * If both of ``include`` and ``exclude`` are empty
            * If ``include`` and ``exclude`` have overlapping elements
            * If any kind of string dtype is passed in

        Notes
        -----
        * To select all *numeric* types, use ``np.number`` or ``'number'``
        * To select strings you must use the ``object`` dtype, but note that
          this will return *all* object dtype columns
        * See the `numpy dtype hierarchy
          <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
        * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
          ``'datetime64'``
        * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
          ``'timedelta64'``
        * To select Pandas categorical dtypes, use ``'category'``
        * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
          0.20.0) or ``'datetime64[ns, tz]'``

        Examples
        --------
        >>> df = pd.DataFrame({'a': [1, 2] * 3,
        ...                    'b': [True, False] * 3,
        ...                    'c': [1.0, 2.0] * 3})
        >>> df
           a      b    c
        0  1   True  1.0
        1  2  False  2.0
        2  1   True  1.0
        3  2  False  2.0
        4  1   True  1.0
        5  2  False  2.0

        >>> df.select_dtypes(include='bool')
               b
        0   True
        1  False
        2   True
        3  False
        4   True
        5  False

        >>> df.select_dtypes(include=['float64'])
             c
        0  1.0
        1  2.0
        2  1.0
        3  2.0
        4  1.0
        5  2.0

        >>> df.select_dtypes(exclude=['int'])
               b    c
        0   True  1.0
        1  False  2.0
        2   True  1.0
        3  False  2.0
        4   True  1.0
        5  False  2.0
        """
        if not is_list_like(include):
            include = (include,) if include is not None else ()
        if not is_list_like(exclude):
            exclude = (exclude,) if exclude is not None else ()

        selection = (frozenset(include), frozenset(exclude))

        if not any(selection):
            raise ValueError("at least one of include or exclude must be nonempty")

        # convert the myriad valid dtype objects to a single representation
        include = frozenset(infer_dtype_from_object(x) for x in include)
        exclude = frozenset(infer_dtype_from_object(x) for x in exclude)
        for dtypes in (include, exclude):
            invalidate_string_dtypes(dtypes)

        # can't both include AND exclude!
        if not include.isdisjoint(exclude):
            raise ValueError(f"include and exclude overlap on {(include & exclude)}")

        # We raise when both include and exclude are empty
        # Hence, we can just shrink the columns we want to keep
        keep_these = np.full(self.shape[1], True)

        def extract_unique_dtypes_from_dtypes_set(
            dtypes_set: FrozenSet[Dtype], unique_dtypes: np.ndarray
        ) -> List[Dtype]:
            extracted_dtypes = [
                unique_dtype
                for unique_dtype in unique_dtypes
                if issubclass(unique_dtype.type, tuple(dtypes_set))  # type: ignore
            ]
            return extracted_dtypes

        unique_dtypes = self.dtypes.unique()

        if include:
            included_dtypes = extract_unique_dtypes_from_dtypes_set(
                include, unique_dtypes
            )
            keep_these &= self.dtypes.isin(included_dtypes)

        if exclude:
            excluded_dtypes = extract_unique_dtypes_from_dtypes_set(
                exclude, unique_dtypes
            )
            keep_these &= ~self.dtypes.isin(excluded_dtypes)

        return self.iloc[:, keep_these.values]
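
    # Editorial sketch (not part of the original source): dtype classes and
    # their string aliases are interchangeable here.
    #
    #     df.select_dtypes(include=np.number)        # all numeric columns
    #     df.select_dtypes(include='number', exclude='float64')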

    def insert(self, loc, column, value, allow_duplicates=False) -> None:
        """
        Insert column into DataFrame at specified location.

        Raises a ValueError if `column` is already contained in the DataFrame,
        unless `allow_duplicates` is set to True.

        Parameters
        ----------
        loc : int
            Insertion index. Must satisfy ``0 <= loc <= len(columns)``.
        column : str, number, or hashable object
            Label of the inserted column.
        value : int, Series, or array-like
        allow_duplicates : bool, optional
        """
        self._ensure_valid_index(value)
        value = self._sanitize_column(column, value, broadcast=False)
        self._data.insert(loc, column, value, allow_duplicates=allow_duplicates)
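
    # Editorial sketch (not part of the original source): unlike
    # ``df[col] = ...``, ``insert`` places the new column at an explicit
    # position and mutates the frame in place.
    #
    #     df.insert(0, "id", range(len(df)))     # new leading column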

    def assign(self, **kwargs) -> "DataFrame":
        r"""
        Assign new columns to a DataFrame.

        Returns a new object with all original columns in addition to new ones.
        Existing columns that are re-assigned will be overwritten.

        Parameters
        ----------
        **kwargs : dict of {str: callable or Series}
            The column names are keywords. If the values are
            callable, they are computed on the DataFrame and
            assigned to the new columns. The callable must not
            change the input DataFrame (though pandas doesn't check it).
            If the values are not callable (e.g. a Series, scalar, or array),
            they are simply assigned.

        Returns
        -------
        DataFrame
            A new DataFrame with the new columns in addition to
            all the existing columns.

        Notes
        -----
        Assigning multiple columns within the same ``assign`` is possible.
        Later items in '\*\*kwargs' may refer to newly created or modified
        columns in 'df'; items are computed and assigned into 'df' in order.

        .. versionchanged:: 0.23.0
           Keyword argument order is maintained.

        Examples
        --------
        >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
        ...                   index=['Portland', 'Berkeley'])
        >>> df
                  temp_c
        Portland    17.0
        Berkeley    25.0

        Where the value is a callable, evaluated on `df`:

        >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        Alternatively, the same behavior can be achieved by directly
        referencing an existing Series or sequence:

        >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        You can create multiple columns within the same assign where one
        of the columns depends on another one defined within the same assign:

        >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
        ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
                  temp_c  temp_f  temp_k
        Portland    17.0    62.6  290.15
        Berkeley    25.0    77.0  298.15
        """
        data = self.copy()

        for k, v in kwargs.items():
            data[k] = com.apply_if_callable(v, data)
        return data

    def _sanitize_column(self, key, value, broadcast=True):
        """
        Ensures new columns (which go into the BlockManager as new blocks) are
        always copied and converted into an array.

        Parameters
        ----------
        key : object
        value : scalar, Series, or array-like
        broadcast : bool, default True
            If ``key`` matches multiple duplicate column names in the
            DataFrame, this parameter indicates whether ``value`` should be
            tiled so that the returned array contains a (duplicated) column for
            each occurrence of the key. If False, ``value`` will not be tiled.

        Returns
        -------
        numpy.ndarray
        """

        def reindexer(value):
            # reindex if necessary
            if value.index.equals(self.index) or not len(self.index):
                value = value._values.copy()
            else:
                # GH 4107
                try:
                    value = value.reindex(self.index)._values
                except ValueError as err:
                    # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
                    if not value.index.is_unique:
                        # duplicate axis
                        raise err

                    # other
                    raise TypeError(
                        "incompatible index of inserted column with frame index"
                    )
            return value

        if isinstance(value, Series):
            value = reindexer(value)

        elif isinstance(value, DataFrame):
            # align right-hand-side columns if self.columns
            # is multi-index and self[key] is a sub-frame
            if isinstance(self.columns, ABCMultiIndex) and key in self.columns:
                loc = self.columns.get_loc(key)
                if isinstance(loc, (slice, Series, np.ndarray, Index)):
                    cols = maybe_droplevels(self.columns[loc], key)
                    if len(cols) and not cols.equals(value.columns):
                        value = value.reindex(cols, axis=1)

            # now align rows
            value = reindexer(value).T

        elif isinstance(value, ExtensionArray):
            # Explicitly copy here, instead of in sanitize_index,
            # as sanitize_index won't copy an EA, even with copy=True
            value = value.copy()
            value = sanitize_index(value, self.index, copy=False)

        elif isinstance(value, Index) or is_sequence(value):
            # turn me into an ndarray
            value = sanitize_index(value, self.index, copy=False)
            if not isinstance(value, (np.ndarray, Index)):
                if isinstance(value, list) and len(value) > 0:
                    value = maybe_convert_platform(value)
                else:
                    value = com.asarray_tuplesafe(value)
            elif value.ndim == 2:
                value = value.copy().T
            elif isinstance(value, Index):
                value = value.copy(deep=True)
            else:
                value = value.copy()

            # possibly infer to datetimelike
            if is_object_dtype(value.dtype):
                value = maybe_infer_to_datetimelike(value)

        else:
            # cast ignores pandas dtypes. so save the dtype first
            infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True)

            # upcast
            value = cast_scalar_to_array(len(self.index), value)
            value = maybe_cast_to_datetime(value, infer_dtype)

        # return internal types directly
        if is_extension_array_dtype(value):
            return value

        # broadcast across multiple columns if necessary
        if broadcast and key in self.columns and value.ndim == 1:
            if not self.columns.is_unique or isinstance(self.columns, ABCMultiIndex):
                existing_piece = self[key]
                if isinstance(existing_piece, DataFrame):
                    value = np.tile(value, (len(existing_piece.columns), 1))

        return np.atleast_2d(np.asarray(value))
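
    # Editorial sketch (not part of the original source): the broadcast branch
    # at the end handles assignment to a duplicated column label.
    #
    #     df = pd.DataFrame([[1, 2]], columns=["a", "a"])
    #     df["a"] = [9]     # the 1-d value is tiled across both "a" columns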

    @property
    def _series(self):
        return {
            item: Series(self._data.iget(idx), index=self.index, name=item)
            for idx, item in enumerate(self.columns)
        }

    def lookup(self, row_labels, col_labels) -> np.ndarray:
        """
        Label-based "fancy indexing" function for DataFrame.

        Given equal-length arrays of row and column labels, return an
        array of the values corresponding to each (row, col) pair.

        Parameters
        ----------
        row_labels : sequence
            The row labels to use for lookup.
        col_labels : sequence
            The column labels to use for lookup.

        Returns
        -------
        numpy.ndarray
            The found values.
        """
        n = len(row_labels)
        if n != len(col_labels):
            raise ValueError("Row labels must have same size as column labels")

        thresh = 1000
        if not self._is_mixed_type or n > thresh:
            values = self.values
            ridx = self.index.get_indexer(row_labels)
            cidx = self.columns.get_indexer(col_labels)
            if (ridx == -1).any():
                raise KeyError("One or more row labels was not found")
            if (cidx == -1).any():
                raise KeyError("One or more column labels was not found")
            flat_index = ridx * len(self.columns) + cidx
            result = values.flat[flat_index]
        else:
            result = np.empty(n, dtype="O")
            for i, (r, c) in enumerate(zip(row_labels, col_labels)):
                result[i] = self._get_value(r, c)

        if is_object_dtype(result):
            result = lib.maybe_convert_objects(result)

        return result
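
    # Editorial sketch (not part of the original source): a typical use of
    # ``lookup`` is picking one value per row according to a per-row column
    # label.
    #
    #     df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    #     picks = pd.Series(["a", "b"], index=df.index)
    #     df.lookup(df.index, picks)      # array([1, 4])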

    # ----------------------------------------------------------------------
    # Reindexing and alignment

    def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
        frame = self

        columns = axes["columns"]
        if columns is not None:
            frame = frame._reindex_columns(
                columns, method, copy, level, fill_value, limit, tolerance
            )

        index = axes["index"]
        if index is not None:
            frame = frame._reindex_index(
                index, method, copy, level, fill_value, limit, tolerance
            )

        return frame

    def _reindex_index(
        self,
        new_index,
        method,
        copy,
        level,
        fill_value=np.nan,
        limit=None,
        tolerance=None,
    ):
        new_index, indexer = self.index.reindex(
            new_index, method=method, level=level, limit=limit, tolerance=tolerance
        )
        return self._reindex_with_indexers(
            {0: [new_index, indexer]},
            copy=copy,
            fill_value=fill_value,
            allow_dups=False,
        )

    def _reindex_columns(
        self,
        new_columns,
        method,
        copy,
        level,
        fill_value=None,
        limit=None,
        tolerance=None,
    ):
        new_columns, indexer = self.columns.reindex(
            new_columns, method=method, level=level, limit=limit, tolerance=tolerance
        )
        return self._reindex_with_indexers(
            {1: [new_columns, indexer]},
            copy=copy,
            fill_value=fill_value,
            allow_dups=False,
        )

    def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame":
        """
        We are guaranteed non-Nones in the axes.
        """
        new_index, row_indexer = self.index.reindex(axes["index"])
        new_columns, col_indexer = self.columns.reindex(axes["columns"])

        if row_indexer is not None and col_indexer is not None:
            indexer = row_indexer, col_indexer
            new_values = algorithms.take_2d_multi(
                self.values, indexer, fill_value=fill_value
            )
            return self._constructor(new_values, index=new_index, columns=new_columns)
        else:
            return self._reindex_with_indexers(
                {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
                copy=copy,
                fill_value=fill_value,
            )

    @Appender(_shared_docs["align"] % _shared_doc_kwargs)
    def align(
        self,
        other,
        join="outer",
        axis=None,
        level=None,
        copy=True,
        fill_value=None,
        method=None,
        limit=None,
        fill_axis=0,
        broadcast_axis=None,
    ) -> "DataFrame":
        return super().align(
            other,
            join=join,
            axis=axis,
            level=level,
            copy=copy,
            fill_value=fill_value,
            method=method,
            limit=limit,
            fill_axis=fill_axis,
            broadcast_axis=broadcast_axis,
        )

    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.reindex.__doc__)
    @rewrite_axis_style_signature(
        "labels",
        [
            ("method", None),
            ("copy", True),
            ("level", None),
            ("fill_value", np.nan),
            ("limit", None),
            ("tolerance", None),
        ],
    )
    def reindex(self, *args, **kwargs) -> "DataFrame":
        axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex")
        kwargs.update(axes)
        # Pop these, since the values are in `kwargs` under different names
        kwargs.pop("axis", None)
        kwargs.pop("labels", None)
        return super().reindex(**kwargs)
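
    # Editorial sketch (not part of the original source): both calling
    # conventions accepted by the rewritten signature above.
    #
    #     df.reindex(index=[0, 1, 5], columns=["a", "b"], fill_value=0)
    #     df.reindex(["a", "b"], axis="columns")    # axis-style equivalent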

    def drop(
        self,
        labels=None,
        axis=0,
        index=None,
        columns=None,
        level=None,
        inplace=False,
        errors="raise",
    ):
        """
        Drop specified labels from rows or columns.

        Remove rows or columns by specifying label names and corresponding
        axis, or by specifying directly index or column names. When using a
        multi-index, labels on different levels can be removed by specifying
        the level.

        Parameters
        ----------
        labels : single label or list-like
            Index or column labels to drop.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Whether to drop labels from the index (0 or 'index') or
            columns (1 or 'columns').
        index : single label or list-like
            Alternative to specifying axis (``labels, axis=0``
            is equivalent to ``index=labels``).

            .. versionadded:: 0.21.0
        columns : single label or list-like
            Alternative to specifying axis (``labels, axis=1``
            is equivalent to ``columns=labels``).

            .. versionadded:: 0.21.0
        level : int or level name, optional
            For MultiIndex, level from which the labels will be removed.
        inplace : bool, default False
            If True, do operation inplace and return None.
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and only existing labels are
            dropped.

        Returns
        -------
        DataFrame
            DataFrame without the removed index or column labels.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis.

        See Also
        --------
        DataFrame.loc : Label-location based indexer for selection by label.
        DataFrame.dropna : Return DataFrame with labels on given axis omitted
            where (all or any) data are missing.
        DataFrame.drop_duplicates : Return DataFrame with duplicate rows
            removed, optionally only considering certain columns.
        Series.drop : Return Series with specified index labels removed.

        Examples
        --------
        >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
        ...                   columns=['A', 'B', 'C', 'D'])
        >>> df
           A  B   C   D
        0  0  1   2   3
        1  4  5   6   7
        2  8  9  10  11

        Drop columns

        >>> df.drop(['B', 'C'], axis=1)
           A   D
        0  0   3
        1  4   7
        2  8  11

        >>> df.drop(columns=['B', 'C'])
           A   D
        0  0   3
        1  4   7
        2  8  11

        Drop a row by index

        >>> df.drop([0, 1])
           A  B   C   D
        2  8  9  10  11

        Drop columns and/or rows of MultiIndex DataFrame

        >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
        ...                              ['speed', 'weight', 'length']],
        ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
        >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
        ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
        ...                         [250, 150], [1.5, 0.8], [320, 250],
        ...                         [1, 0.8], [0.3, 0.2]])
        >>> df
                          big  small
        lama    speed    45.0   30.0
                weight  200.0  100.0
                length    1.5    1.0
        cow     speed    30.0   20.0
                weight  250.0  150.0
                length    1.5    0.8
        falcon  speed   320.0  250.0
                weight    1.0    0.8
                length    0.3    0.2

        >>> df.drop(index='cow', columns='small')
                          big
        lama    speed    45.0
                weight  200.0
                length    1.5
        falcon  speed   320.0
                weight    1.0
                length    0.3

        >>> df.drop(index='length', level=1)
                          big  small
        lama    speed    45.0   30.0
                weight  200.0  100.0
        cow     speed    30.0   20.0
                weight  250.0  150.0
        falcon  speed   320.0  250.0
                weight    1.0    0.8
        """
        return super().drop(
            labels=labels,
            axis=axis,
            index=index,
            columns=columns,
            level=level,
            inplace=inplace,
            errors=errors,
        )

    @rewrite_axis_style_signature(
        "mapper",
        [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")],
    )
    def rename(
        self,
        mapper: Optional[Renamer] = None,
        *,
        index: Optional[Renamer] = None,
        columns: Optional[Renamer] = None,
        axis: Optional[Axis] = None,
        copy: bool = True,
        inplace: bool = False,
        level: Optional[Level] = None,
        errors: str = "ignore",
    ) -> Optional["DataFrame"]:
        """
        Alter axes labels.

        Function / dict values must be unique (1-to-1). Labels not contained in
        a dict / Series will be left as-is. Extra labels listed don't throw an
        error.

        See the :ref:`user guide <basics.rename>` for more.

        Parameters
        ----------
        mapper : dict-like or function
            Dict-like or function transformations to apply to
            that axis' values. Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index`` and
            ``columns``.
        index : dict-like or function
            Alternative to specifying axis (``mapper, axis=0``
            is equivalent to ``index=mapper``).
        columns : dict-like or function
            Alternative to specifying axis (``mapper, axis=1``
            is equivalent to ``columns=mapper``).
        axis : int or str
            Axis to target with ``mapper``. Can be either the axis name
            ('index', 'columns') or number (0, 1). The default is 'index'.
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Whether to return a new DataFrame. If True then value of copy is
            ignored.
        level : int or level name, default None
            In case of a MultiIndex, only rename labels in the specified
            level.
        errors : {'ignore', 'raise'}, default 'ignore'
            If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
            or `columns` contains labels that are not present in the Index
            being transformed.
            If 'ignore', existing keys will be renamed and extra keys will be
            ignored.

        Returns
        -------
        DataFrame
            DataFrame with the renamed axis labels.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis and
            "errors='raise'".

        See Also
        --------
        DataFrame.rename_axis : Set the name of the axis.

        Examples
        --------
        ``DataFrame.rename`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Rename columns using a mapping:

        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        >>> df.rename(columns={"A": "a", "B": "c"})
           a  c
        0  1  4
        1  2  5
        2  3  6

        Rename index using a mapping:

        >>> df.rename(index={0: "x", 1: "y", 2: "z"})
           A  B
        x  1  4
        y  2  5
        z  3  6

        Cast index labels to a different type:

        >>> df.index
        RangeIndex(start=0, stop=3, step=1)
        >>> df.rename(index=str).index
        Index(['0', '1', '2'], dtype='object')

        >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
        Traceback (most recent call last):
        KeyError: ['C'] not found in axis

        Using axis-style parameters

        >>> df.rename(str.lower, axis='columns')
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename({1: 2, 2: 4}, axis='index')
           A  B
        0  1  4
        2  2  5
        4  3  6
        """
        return super().rename(
            mapper=mapper,
            index=index,
            columns=columns,
            axis=axis,
            copy=copy,
            inplace=inplace,
            level=level,
            errors=errors,
        )

    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.fillna.__doc__)
    def fillna(
        self,
        value=None,
        method=None,
        axis=None,
        inplace=False,
        limit=None,
        downcast=None,
    ) -> Optional["DataFrame"]:
        return super().fillna(
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )

    @Appender(_shared_docs["replace"] % _shared_doc_kwargs)
    def replace(
        self,
        to_replace=None,
        value=None,
        inplace=False,
        limit=None,
        regex=False,
        method="pad",
    ):
        return super().replace(
            to_replace=to_replace,
            value=value,
            inplace=inplace,
            limit=limit,
            regex=regex,
            method=method,
        )

    @Appender(_shared_docs["shift"] % _shared_doc_kwargs)
    def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame":
        return super().shift(
            periods=periods, freq=freq, axis=axis, fill_value=fill_value
        )

    def set_index(
        self, keys, drop=True, append=False, inplace=False, verify_integrity=False
    ):
        """
        Set the DataFrame index using existing columns.

        Set the DataFrame index (row labels) using one or more existing
        columns or arrays (of the correct length). The index can replace the
        existing index or expand on it.

        Parameters
        ----------
        keys : label or array-like or list of labels/arrays
            This parameter can be either a single column key, a single array of
            the same length as the calling DataFrame, or a list containing an
            arbitrary combination of column keys and arrays. Here, "array"
            encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
            instances of :class:`~collections.abc.Iterator`.
        drop : bool, default True
            Delete columns to be used as the new index.
        append : bool, default False
            Whether to append columns to existing index.
        inplace : bool, default False
            Modify the DataFrame in place (do not create a new object).
        verify_integrity : bool, default False
            Check the new index for duplicates. Otherwise defer the check until
            necessary. Setting to False will improve the performance of this
            method.

        Returns
        -------
        DataFrame
            Changed row labels.

        See Also
        --------
        DataFrame.reset_index : Opposite of set_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
        ...                    'year': [2012, 2014, 2013, 2014],
        ...                    'sale': [55, 40, 84, 31]})
        >>> df
           month  year  sale
        0      1  2012    55
        1      4  2014    40
        2      7  2013    84
        3     10  2014    31

        Set the index to become the 'month' column:

        >>> df.set_index('month')
               year  sale
        month
        1      2012    55
        4      2014    40
        7      2013    84
        10     2014    31

        Create a MultiIndex using columns 'year' and 'month':

        >>> df.set_index(['year', 'month'])
                    sale
        year  month
        2012  1     55
        2014  4     40
        2013  7     84
        2014  10    31

        Create a MultiIndex using an Index and a column:

        >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
                 month  sale
           year
        1  2012  1      55
        2  2014  4      40
        3  2013  7      84
        4  2014  10     31

        Create a MultiIndex using two Series:

        >>> s = pd.Series([1, 2, 3, 4])
        >>> df.set_index([s, s**2])
              month  year  sale
        1 1       1  2012    55
        2 4       4  2014    40
        3 9       7  2013    84
        4 16     10  2014    31
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        if not isinstance(keys, list):
            keys = [keys]

        err_msg = (
            'The parameter "keys" may be a column key, one-dimensional '
            "array, or a list containing only valid column keys and "
            "one-dimensional arrays."
        )

        missing: List[Optional[Hashable]] = []
        for col in keys:
            if isinstance(
                col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator)
            ):
                # arrays are fine as long as they are one-dimensional
                # iterators get converted to list below
                if getattr(col, "ndim", 1) != 1:
                    raise ValueError(err_msg)
            else:
                # everything else gets tried as a key; see GH 24969
                try:
                    found = col in self.columns
                except TypeError:
                    raise TypeError(
                        f"{err_msg}. Received column of type {type(col)}"
                    )
                else:
                    if not found:
                        missing.append(col)

        if missing:
            raise KeyError(f"None of {missing} are in the columns")

        if inplace:
            frame = self
        else:
            frame = self.copy()

        arrays = []
  3685. names = []
  3686. if append:
  3687. names = list(self.index.names)
  3688. if isinstance(self.index, ABCMultiIndex):
  3689. for i in range(self.index.nlevels):
  3690. arrays.append(self.index._get_level_values(i))
  3691. else:
  3692. arrays.append(self.index)
  3693. to_remove: List[Optional[Hashable]] = []
  3694. for col in keys:
  3695. if isinstance(col, ABCMultiIndex):
  3696. for n in range(col.nlevels):
  3697. arrays.append(col._get_level_values(n))
  3698. names.extend(col.names)
  3699. elif isinstance(col, (ABCIndexClass, ABCSeries)):
  3700. # if Index then not MultiIndex (treated above)
  3701. arrays.append(col)
  3702. names.append(col.name)
  3703. elif isinstance(col, (list, np.ndarray)):
  3704. arrays.append(col)
  3705. names.append(None)
  3706. elif isinstance(col, abc.Iterator):
  3707. arrays.append(list(col))
  3708. names.append(None)
  3709. # from here, col can only be a column label
  3710. else:
  3711. arrays.append(frame[col]._values)
  3712. names.append(col)
  3713. if drop:
  3714. to_remove.append(col)
  3715. if len(arrays[-1]) != len(self):
  3716. # check newest element against length of calling frame, since
  3717. # ensure_index_from_sequences would not raise for append=False.
  3718. raise ValueError(
  3719. f"Length mismatch: Expected {len(self)} rows, "
  3720. f"received array of length {len(arrays[-1])}"
  3721. )
  3722. index = ensure_index_from_sequences(arrays, names)
  3723. if verify_integrity and not index.is_unique:
  3724. duplicates = index[index.duplicated()].unique()
  3725. raise ValueError(f"Index has duplicate keys: {duplicates}")
  3726. # use set to handle duplicate column names gracefully in case of drop
  3727. for c in set(to_remove):
  3728. del frame[c]
  3729. # clear up memory usage
  3730. index._cleanup()
  3731. frame.index = index
  3732. if not inplace:
  3733. return frame
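# The ``append=True`` branch above keeps the existing index levels and adds
# the new keys after them; a short sketch of that path, assuming standard
# pandas behavior (illustrative data):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"month": [1, 4], "year": [2012, 2014]})
#   >>> df.set_index("year", append=True)
#           month
#     year
#   0 2012      1
#   1 2014      4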
  3734. def reset_index(
  3735. self,
  3736. level: Optional[Union[Hashable, Sequence[Hashable]]] = None,
  3737. drop: bool = False,
  3738. inplace: bool = False,
  3739. col_level: Hashable = 0,
  3740. col_fill: Optional[Hashable] = "",
  3741. ) -> Optional["DataFrame"]:
  3742. """
  3743. Reset the index, or a level of it.
  3744. Reset the index of the DataFrame, and use the default one instead.
  3745. If the DataFrame has a MultiIndex, this method can remove one or more
  3746. levels.
  3747. Parameters
  3748. ----------
  3749. level : int, str, tuple, or list, default None
  3750. Only remove the given levels from the index. Removes all levels by
  3751. default.
  3752. drop : bool, default False
  3753. Do not try to insert index into dataframe columns. This resets
  3754. the index to the default integer index.
  3755. inplace : bool, default False
  3756. Modify the DataFrame in place (do not create a new object).
  3757. col_level : int or str, default 0
  3758. If the columns have multiple levels, determines which level the
  3759. labels are inserted into. By default it is inserted into the first
  3760. level.
  3761. col_fill : object, default ''
  3762. If the columns have multiple levels, determines how the other
  3763. levels are named. If None then the index name is repeated.
  3764. Returns
  3765. -------
  3766. DataFrame or None
  3767. DataFrame with the new index or None if ``inplace=True``.
  3768. See Also
  3769. --------
  3770. DataFrame.set_index : Opposite of reset_index.
  3771. DataFrame.reindex : Change to new indices or expand indices.
  3772. DataFrame.reindex_like : Change to same indices as other DataFrame.
  3773. Examples
  3774. --------
  3775. >>> df = pd.DataFrame([('bird', 389.0),
  3776. ... ('bird', 24.0),
  3777. ... ('mammal', 80.5),
  3778. ... ('mammal', np.nan)],
  3779. ... index=['falcon', 'parrot', 'lion', 'monkey'],
  3780. ... columns=('class', 'max_speed'))
  3781. >>> df
  3782. class max_speed
  3783. falcon bird 389.0
  3784. parrot bird 24.0
  3785. lion mammal 80.5
  3786. monkey mammal NaN
  3787. When we reset the index, the old index is added as a column, and a
  3788. new sequential index is used:
  3789. >>> df.reset_index()
  3790. index class max_speed
  3791. 0 falcon bird 389.0
  3792. 1 parrot bird 24.0
  3793. 2 lion mammal 80.5
  3794. 3 monkey mammal NaN
  3795. We can use the `drop` parameter to avoid the old index being added as
  3796. a column:
  3797. >>> df.reset_index(drop=True)
  3798. class max_speed
  3799. 0 bird 389.0
  3800. 1 bird 24.0
  3801. 2 mammal 80.5
  3802. 3 mammal NaN
  3803. You can also use `reset_index` with `MultiIndex`.
  3804. >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
  3805. ... ('bird', 'parrot'),
  3806. ... ('mammal', 'lion'),
  3807. ... ('mammal', 'monkey')],
  3808. ... names=['class', 'name'])
  3809. >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
  3810. ... ('species', 'type')])
  3811. >>> df = pd.DataFrame([(389.0, 'fly'),
  3812. ... ( 24.0, 'fly'),
  3813. ... ( 80.5, 'run'),
  3814. ... (np.nan, 'jump')],
  3815. ... index=index,
  3816. ... columns=columns)
  3817. >>> df
  3818. speed species
  3819. max type
  3820. class name
  3821. bird falcon 389.0 fly
  3822. parrot 24.0 fly
  3823. mammal lion 80.5 run
  3824. monkey NaN jump
  3825. If the index has multiple levels, we can reset a subset of them:
  3826. >>> df.reset_index(level='class')
  3827. class speed species
  3828. max type
  3829. name
  3830. falcon bird 389.0 fly
  3831. parrot bird 24.0 fly
  3832. lion mammal 80.5 run
  3833. monkey mammal NaN jump
  3834. If we are not dropping the index, by default, it is placed in the top
  3835. level. We can place it in another level:
  3836. >>> df.reset_index(level='class', col_level=1)
  3837. speed species
  3838. class max type
  3839. name
  3840. falcon bird 389.0 fly
  3841. parrot bird 24.0 fly
  3842. lion mammal 80.5 run
  3843. monkey mammal NaN jump
  3844. When the index is inserted under another level, we can specify under
  3845. which one with the parameter `col_fill`:
  3846. >>> df.reset_index(level='class', col_level=1, col_fill='species')
  3847. species speed species
  3848. class max type
  3849. name
  3850. falcon bird 389.0 fly
  3851. parrot bird 24.0 fly
  3852. lion mammal 80.5 run
  3853. monkey mammal NaN jump
  3854. If we specify a nonexistent level for `col_fill`, it is created:
  3855. >>> df.reset_index(level='class', col_level=1, col_fill='genus')
  3856. genus speed species
  3857. class max type
  3858. name
  3859. falcon bird 389.0 fly
  3860. parrot bird 24.0 fly
  3861. lion mammal 80.5 run
  3862. monkey mammal NaN jump
  3863. """
  3864. inplace = validate_bool_kwarg(inplace, "inplace")
  3865. if inplace:
  3866. new_obj = self
  3867. else:
  3868. new_obj = self.copy()
  3869. def _maybe_casted_values(index, labels=None):
  3870. values = index._values
  3871. if not isinstance(index, (PeriodIndex, DatetimeIndex)):
  3872. if values.dtype == np.object_:
  3873. values = lib.maybe_convert_objects(values)
  3874. # if we have the labels, extract the values with a mask
  3875. if labels is not None:
  3876. mask = labels == -1
  3877. # we can have situations where the whole mask is -1,
3878. # meaning there is nothing found in labels, so make all NaNs
  3879. if mask.all():
  3880. values = np.empty(len(mask))
  3881. values.fill(np.nan)
  3882. else:
  3883. values = values.take(labels)
  3884. # TODO(https://github.com/pandas-dev/pandas/issues/24206)
  3885. # Push this into maybe_upcast_putmask?
  3886. # We can't pass EAs there right now. Looks a bit
  3887. # complicated.
  3888. # So we unbox the ndarray_values, op, re-box.
  3889. values_type = type(values)
  3890. values_dtype = values.dtype
  3891. if issubclass(values_type, DatetimeLikeArray):
  3892. values = values._data
  3893. if mask.any():
  3894. values, _ = maybe_upcast_putmask(values, mask, np.nan)
  3895. if issubclass(values_type, DatetimeLikeArray):
  3896. values = values_type(values, dtype=values_dtype)
  3897. return values
  3898. new_index = ibase.default_index(len(new_obj))
  3899. if level is not None:
  3900. if not isinstance(level, (tuple, list)):
  3901. level = [level]
  3902. level = [self.index._get_level_number(lev) for lev in level]
  3903. if len(level) < self.index.nlevels:
  3904. new_index = self.index.droplevel(level)
  3905. if not drop:
  3906. to_insert: Iterable[Tuple[Any, Optional[Any]]]
  3907. if isinstance(self.index, ABCMultiIndex):
  3908. names = [
  3909. (n if n is not None else f"level_{i}")
  3910. for i, n in enumerate(self.index.names)
  3911. ]
  3912. to_insert = zip(self.index.levels, self.index.codes)
  3913. else:
  3914. default = "index" if "index" not in self else "level_0"
  3915. names = [default] if self.index.name is None else [self.index.name]
  3916. to_insert = ((self.index, None),)
  3917. multi_col = isinstance(self.columns, ABCMultiIndex)
  3918. for i, (lev, lab) in reversed(list(enumerate(to_insert))):
  3919. if not (level is None or i in level):
  3920. continue
  3921. name = names[i]
  3922. if multi_col:
  3923. col_name = list(name) if isinstance(name, tuple) else [name]
  3924. if col_fill is None:
  3925. if len(col_name) not in (1, self.columns.nlevels):
  3926. raise ValueError(
  3927. "col_fill=None is incompatible "
  3928. f"with incomplete column name {name}"
  3929. )
  3930. col_fill = col_name[0]
  3931. lev_num = self.columns._get_level_number(col_level)
  3932. name_lst = [col_fill] * lev_num + col_name
  3933. missing = self.columns.nlevels - len(name_lst)
  3934. name_lst += [col_fill] * missing
  3935. name = tuple(name_lst)
  3936. # to ndarray and maybe infer different dtype
  3937. level_values = _maybe_casted_values(lev, lab)
  3938. new_obj.insert(0, name, level_values)
  3939. new_obj.index = new_index
  3940. if not inplace:
  3941. return new_obj
  3942. return None
  3943. # ----------------------------------------------------------------------
  3944. # Reindex-based selection methods
  3945. @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
  3946. def isna(self) -> "DataFrame":
  3947. return super().isna()
  3948. @Appender(_shared_docs["isna"] % _shared_doc_kwargs)
  3949. def isnull(self) -> "DataFrame":
  3950. return super().isnull()
  3951. @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
  3952. def notna(self) -> "DataFrame":
  3953. return super().notna()
  3954. @Appender(_shared_docs["notna"] % _shared_doc_kwargs)
  3955. def notnull(self) -> "DataFrame":
  3956. return super().notnull()
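# ``isnull``/``notnull`` are aliases of ``isna``/``notna``; a quick sketch of
# the duality, assuming standard pandas semantics (illustrative data):
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": [1.0, np.nan]})
#   >>> df.isna()
#          A
#   0  False
#   1   True
#   >>> df.notna().equals(~df.isna())
#   True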
  3957. def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
  3958. """
  3959. Remove missing values.
  3960. See the :ref:`User Guide <missing_data>` for more on which values are
  3961. considered missing, and how to work with missing data.
  3962. Parameters
  3963. ----------
  3964. axis : {0 or 'index', 1 or 'columns'}, default 0
  3965. Determine if rows or columns which contain missing values are
  3966. removed.
  3967. * 0, or 'index' : Drop rows which contain missing values.
  3968. * 1, or 'columns' : Drop columns which contain missing value.
  3969. .. versionchanged:: 1.0.0
3970. Passing a tuple or list to drop on multiple axes is no longer
3971. supported; only a single axis is allowed.
  3972. how : {'any', 'all'}, default 'any'
  3973. Determine if row or column is removed from DataFrame, when we have
  3974. at least one NA or all NA.
  3975. * 'any' : If any NA values are present, drop that row or column.
  3976. * 'all' : If all values are NA, drop that row or column.
  3977. thresh : int, optional
3978. Require that many non-NA values; rows or columns with fewer are dropped.
  3979. subset : array-like, optional
  3980. Labels along other axis to consider, e.g. if you are dropping rows
  3981. these would be a list of columns to include.
  3982. inplace : bool, default False
  3983. If True, do operation inplace and return None.
  3984. Returns
  3985. -------
  3986. DataFrame
  3987. DataFrame with NA entries dropped from it.
  3988. See Also
  3989. --------
  3990. DataFrame.isna: Indicate missing values.
  3991. DataFrame.notna : Indicate existing (non-missing) values.
  3992. DataFrame.fillna : Replace missing values.
  3993. Series.dropna : Drop missing values.
  3994. Index.dropna : Drop missing indices.
  3995. Examples
  3996. --------
  3997. >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
  3998. ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
  3999. ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
  4000. ... pd.NaT]})
  4001. >>> df
  4002. name toy born
  4003. 0 Alfred NaN NaT
  4004. 1 Batman Batmobile 1940-04-25
  4005. 2 Catwoman Bullwhip NaT
  4006. Drop the rows where at least one element is missing.
  4007. >>> df.dropna()
  4008. name toy born
  4009. 1 Batman Batmobile 1940-04-25
  4010. Drop the columns where at least one element is missing.
  4011. >>> df.dropna(axis='columns')
  4012. name
  4013. 0 Alfred
  4014. 1 Batman
  4015. 2 Catwoman
  4016. Drop the rows where all elements are missing.
  4017. >>> df.dropna(how='all')
  4018. name toy born
  4019. 0 Alfred NaN NaT
  4020. 1 Batman Batmobile 1940-04-25
  4021. 2 Catwoman Bullwhip NaT
  4022. Keep only the rows with at least 2 non-NA values.
  4023. >>> df.dropna(thresh=2)
  4024. name toy born
  4025. 1 Batman Batmobile 1940-04-25
  4026. 2 Catwoman Bullwhip NaT
  4027. Define in which columns to look for missing values.
  4028. >>> df.dropna(subset=['name', 'born'])
  4029. name toy born
  4030. 1 Batman Batmobile 1940-04-25
  4031. Keep the DataFrame with valid entries in the same variable.
  4032. >>> df.dropna(inplace=True)
  4033. >>> df
  4034. name toy born
  4035. 1 Batman Batmobile 1940-04-25
  4036. """
  4037. inplace = validate_bool_kwarg(inplace, "inplace")
  4038. if isinstance(axis, (tuple, list)):
  4039. # GH20987
  4040. raise TypeError("supplying multiple axes to axis is no longer supported.")
  4041. axis = self._get_axis_number(axis)
  4042. agg_axis = 1 - axis
  4043. agg_obj = self
  4044. if subset is not None:
  4045. ax = self._get_axis(agg_axis)
  4046. indices = ax.get_indexer_for(subset)
  4047. check = indices == -1
  4048. if check.any():
  4049. raise KeyError(list(np.compress(check, subset)))
  4050. agg_obj = self.take(indices, axis=agg_axis)
  4051. count = agg_obj.count(axis=agg_axis)
  4052. if thresh is not None:
  4053. mask = count >= thresh
  4054. elif how == "any":
  4055. mask = count == len(agg_obj._get_axis(agg_axis))
  4056. elif how == "all":
  4057. mask = count > 0
  4058. else:
  4059. if how is not None:
  4060. raise ValueError(f"invalid how option: {how}")
  4061. else:
  4062. raise TypeError("must specify how or thresh")
  4063. result = self.loc(axis=axis)[mask]
  4064. if inplace:
  4065. self._update_inplace(result)
  4066. else:
  4067. return result
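# The implementation above reduces every ``how``/``thresh`` combination to a
# threshold on the per-row (or per-column) non-NA ``count``; a sketch of that
# equivalence using only public API (illustrative data):
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": [1.0, np.nan], "B": [2.0, np.nan]})
#   >>> df.dropna(how="any").equals(df[df.count(axis=1) == df.shape[1]])
#   True
#   >>> df.dropna(how="all").equals(df[df.count(axis=1) > 0])
#   True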
  4068. def drop_duplicates(
  4069. self,
  4070. subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
  4071. keep: Union[str, bool] = "first",
  4072. inplace: bool = False,
  4073. ignore_index: bool = False,
  4074. ) -> Optional["DataFrame"]:
  4075. """
  4076. Return DataFrame with duplicate rows removed.
4077. Considering certain columns is optional. Indexes, including time
4078. indexes, are ignored.
  4079. Parameters
  4080. ----------
  4081. subset : column label or sequence of labels, optional
  4082. Only consider certain columns for identifying duplicates, by
  4083. default use all of the columns.
  4084. keep : {'first', 'last', False}, default 'first'
  4085. Determines which duplicates (if any) to keep.
  4086. - ``first`` : Drop duplicates except for the first occurrence.
  4087. - ``last`` : Drop duplicates except for the last occurrence.
  4088. - False : Drop all duplicates.
  4089. inplace : bool, default False
  4090. Whether to drop duplicates in place or to return a copy.
  4091. ignore_index : bool, default False
  4092. If True, the resulting axis will be labeled 0, 1, …, n - 1.
  4093. .. versionadded:: 1.0.0
  4094. Returns
  4095. -------
  4096. DataFrame
  4097. DataFrame with duplicates removed or None if ``inplace=True``.
  4098. """
  4099. if self.empty:
  4100. return self.copy()
  4101. inplace = validate_bool_kwarg(inplace, "inplace")
  4102. duplicated = self.duplicated(subset, keep=keep)
  4103. if inplace:
  4104. (inds,) = (-duplicated)._ndarray_values.nonzero()
  4105. new_data = self._data.take(inds)
  4106. if ignore_index:
  4107. new_data.axes[1] = ibase.default_index(len(inds))
  4108. self._update_inplace(new_data)
  4109. else:
  4110. result = self[-duplicated]
  4111. if ignore_index:
  4112. result.index = ibase.default_index(len(result))
  4113. return result
  4114. return None
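# The docstring above carries no examples; a minimal usage sketch for
# ``drop_duplicates``, assuming standard pandas semantics (illustrative data):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": [1, 1, 2], "B": ["x", "x", "y"]})
#   >>> df.drop_duplicates()
#      A  B
#   0  1  x
#   2  2  y
#   >>> df.drop_duplicates(keep="last", ignore_index=True)
#      A  B
#   0  1  x
#   1  2  y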
  4115. def duplicated(
  4116. self,
  4117. subset: Optional[Union[Hashable, Sequence[Hashable]]] = None,
  4118. keep: Union[str, bool] = "first",
  4119. ) -> "Series":
  4120. """
  4121. Return boolean Series denoting duplicate rows.
  4122. Considering certain columns is optional.
  4123. Parameters
  4124. ----------
  4125. subset : column label or sequence of labels, optional
  4126. Only consider certain columns for identifying duplicates, by
  4127. default use all of the columns.
  4128. keep : {'first', 'last', False}, default 'first'
  4129. Determines which duplicates (if any) to mark.
  4130. - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
  4131. - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
  4132. - False : Mark all duplicates as ``True``.
  4133. Returns
  4134. -------
  4135. Series
  4136. """
  4137. from pandas.core.sorting import get_group_index
  4138. from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
  4139. if self.empty:
  4140. return Series(dtype=bool)
  4141. def f(vals):
  4142. labels, shape = algorithms.factorize(
  4143. vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)
  4144. )
  4145. return labels.astype("i8", copy=False), len(shape)
  4146. if subset is None:
  4147. subset = self.columns
  4148. elif (
  4149. not np.iterable(subset)
  4150. or isinstance(subset, str)
  4151. or isinstance(subset, tuple)
  4152. and subset in self.columns
  4153. ):
  4154. subset = (subset,)
  4155. # needed for mypy since can't narrow types using np.iterable
  4156. subset = cast(Iterable, subset)
  4157. # Verify all columns in subset exist in the queried dataframe
  4158. # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
  4159. # key that doesn't exist.
  4160. diff = Index(subset).difference(self.columns)
  4161. if not diff.empty:
  4162. raise KeyError(diff)
  4163. vals = (col.values for name, col in self.items() if name in subset)
  4164. labels, shape = map(list, zip(*map(f, vals)))
  4165. ids = get_group_index(labels, shape, sort=False, xnull=False)
  4166. return Series(duplicated_int64(ids, keep), index=self.index)
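# ``duplicated`` is the boolean primitive behind ``drop_duplicates``; a short
# sketch, assuming standard pandas semantics (illustrative data):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": [1, 1, 2], "B": ["x", "x", "y"]})
#   >>> df.duplicated()
#   0    False
#   1     True
#   2    False
#   dtype: bool
#   >>> df.duplicated(subset=["A"], keep=False)
#   0     True
#   1     True
#   2    False
#   dtype: bool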
  4167. # ----------------------------------------------------------------------
  4168. # Sorting
  4169. @Substitution(**_shared_doc_kwargs)
  4170. @Appender(NDFrame.sort_values.__doc__)
  4171. def sort_values(
  4172. self,
  4173. by,
  4174. axis=0,
  4175. ascending=True,
  4176. inplace=False,
  4177. kind="quicksort",
  4178. na_position="last",
  4179. ignore_index=False,
  4180. ):
  4181. inplace = validate_bool_kwarg(inplace, "inplace")
  4182. axis = self._get_axis_number(axis)
  4183. if not isinstance(by, list):
  4184. by = [by]
  4185. if is_sequence(ascending) and len(by) != len(ascending):
  4186. raise ValueError(
  4187. f"Length of ascending ({len(ascending)}) != length of by ({len(by)})"
  4188. )
  4189. if len(by) > 1:
  4190. from pandas.core.sorting import lexsort_indexer
  4191. keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
  4192. indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position)
  4193. indexer = ensure_platform_int(indexer)
  4194. else:
  4195. from pandas.core.sorting import nargsort
  4196. by = by[0]
  4197. k = self._get_label_or_level_values(by, axis=axis)
  4198. if isinstance(ascending, (tuple, list)):
  4199. ascending = ascending[0]
  4200. indexer = nargsort(
  4201. k, kind=kind, ascending=ascending, na_position=na_position
  4202. )
  4203. new_data = self._data.take(
  4204. indexer, axis=self._get_block_manager_axis(axis), verify=False
  4205. )
  4206. if ignore_index:
  4207. new_data.axes[1] = ibase.default_index(len(indexer))
  4208. if inplace:
  4209. return self._update_inplace(new_data)
  4210. else:
  4211. return self._constructor(new_data).__finalize__(self)
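# A short sketch of the multi-key path above (``lexsort_indexer`` is used once
# ``by`` has more than one label); behavior per the shared docstring, with
# illustrative data:
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": [2, 1, 1], "B": [np.nan, 9, 8]})
#   >>> df.sort_values(by=["A", "B"], ascending=[True, False])
#      A    B
#   1  1  9.0
#   2  1  8.0
#   0  2  NaN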
  4212. @Substitution(**_shared_doc_kwargs)
  4213. @Appender(NDFrame.sort_index.__doc__)
  4214. def sort_index(
  4215. self,
  4216. axis=0,
  4217. level=None,
  4218. ascending=True,
  4219. inplace=False,
  4220. kind="quicksort",
  4221. na_position="last",
  4222. sort_remaining=True,
  4223. ignore_index: bool = False,
  4224. ):
  4225. # TODO: this can be combined with Series.sort_index impl as
  4226. # almost identical
  4227. inplace = validate_bool_kwarg(inplace, "inplace")
  4228. axis = self._get_axis_number(axis)
  4229. labels = self._get_axis(axis)
  4230. # make sure that the axis is lexsorted to start
  4231. # if not we need to reconstruct to get the correct indexer
  4232. labels = labels._sort_levels_monotonic()
  4233. if level is not None:
  4234. new_axis, indexer = labels.sortlevel(
  4235. level, ascending=ascending, sort_remaining=sort_remaining
  4236. )
  4237. elif isinstance(labels, ABCMultiIndex):
  4238. from pandas.core.sorting import lexsort_indexer
  4239. indexer = lexsort_indexer(
  4240. labels._get_codes_for_sorting(),
  4241. orders=ascending,
  4242. na_position=na_position,
  4243. )
  4244. else:
  4245. from pandas.core.sorting import nargsort
  4246. # Check monotonic-ness before sort an index
  4247. # GH11080
  4248. if (ascending and labels.is_monotonic_increasing) or (
  4249. not ascending and labels.is_monotonic_decreasing
  4250. ):
  4251. if inplace:
  4252. return
  4253. else:
  4254. return self.copy()
  4255. indexer = nargsort(
  4256. labels, kind=kind, ascending=ascending, na_position=na_position
  4257. )
  4258. baxis = self._get_block_manager_axis(axis)
  4259. new_data = self._data.take(indexer, axis=baxis, verify=False)
  4260. # reconstruct axis if needed
  4261. new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
  4262. if ignore_index:
  4263. new_data.axes[1] = ibase.default_index(len(indexer))
  4264. if inplace:
  4265. return self._update_inplace(new_data)
  4266. else:
  4267. return self._constructor(new_data).__finalize__(self)
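# A short sketch of the ``level`` path above (``Index.sortlevel`` resolves the
# requested level); standard pandas behavior assumed, illustrative data:
#
#   >>> import pandas as pd
#   >>> idx = pd.MultiIndex.from_tuples([("b", 2), ("a", 1)], names=["k1", "k2"])
#   >>> df = pd.DataFrame({"v": [1, 2]}, index=idx)
#   >>> df.sort_index(level="k1")
#          v
#   k1 k2
#   a  1   2
#   b  2   1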
  4268. def nlargest(self, n, columns, keep="first") -> "DataFrame":
  4269. """
  4270. Return the first `n` rows ordered by `columns` in descending order.
  4271. Return the first `n` rows with the largest values in `columns`, in
  4272. descending order. The columns that are not specified are returned as
  4273. well, but not used for ordering.
  4274. This method is equivalent to
  4275. ``df.sort_values(columns, ascending=False).head(n)``, but more
  4276. performant.
  4277. Parameters
  4278. ----------
  4279. n : int
  4280. Number of rows to return.
  4281. columns : label or list of labels
  4282. Column label(s) to order by.
  4283. keep : {'first', 'last', 'all'}, default 'first'
  4284. Where there are duplicate values:
4285. - ``first`` : prioritize the first occurrence(s)
4286. - ``last`` : prioritize the last occurrence(s)
4287. - ``all`` : do not drop any duplicates, even if it means
  4288. selecting more than `n` items.
  4289. .. versionadded:: 0.24.0
  4290. Returns
  4291. -------
  4292. DataFrame
  4293. The first `n` rows ordered by the given columns in descending
  4294. order.
  4295. See Also
  4296. --------
  4297. DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
  4298. ascending order.
  4299. DataFrame.sort_values : Sort DataFrame by the values.
  4300. DataFrame.head : Return the first `n` rows without re-ordering.
  4301. Notes
  4302. -----
  4303. This function cannot be used with all column types. For example, when
  4304. specifying columns with `object` or `category` dtypes, ``TypeError`` is
  4305. raised.
  4306. Examples
  4307. --------
  4308. >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
  4309. ... 434000, 434000, 337000, 11300,
  4310. ... 11300, 11300],
  4311. ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
  4312. ... 17036, 182, 38, 311],
  4313. ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
  4314. ... "IS", "NR", "TV", "AI"]},
  4315. ... index=["Italy", "France", "Malta",
  4316. ... "Maldives", "Brunei", "Iceland",
  4317. ... "Nauru", "Tuvalu", "Anguilla"])
  4318. >>> df
  4319. population GDP alpha-2
  4320. Italy 59000000 1937894 IT
  4321. France 65000000 2583560 FR
  4322. Malta 434000 12011 MT
  4323. Maldives 434000 4520 MV
  4324. Brunei 434000 12128 BN
  4325. Iceland 337000 17036 IS
  4326. Nauru 11300 182 NR
  4327. Tuvalu 11300 38 TV
  4328. Anguilla 11300 311 AI
  4329. In the following example, we will use ``nlargest`` to select the three
  4330. rows having the largest values in column "population".
  4331. >>> df.nlargest(3, 'population')
  4332. population GDP alpha-2
  4333. France 65000000 2583560 FR
  4334. Italy 59000000 1937894 IT
  4335. Malta 434000 12011 MT
  4336. When using ``keep='last'``, ties are resolved in reverse order:
  4337. >>> df.nlargest(3, 'population', keep='last')
  4338. population GDP alpha-2
  4339. France 65000000 2583560 FR
  4340. Italy 59000000 1937894 IT
  4341. Brunei 434000 12128 BN
  4342. When using ``keep='all'``, all duplicate items are maintained:
  4343. >>> df.nlargest(3, 'population', keep='all')
  4344. population GDP alpha-2
  4345. France 65000000 2583560 FR
  4346. Italy 59000000 1937894 IT
  4347. Malta 434000 12011 MT
  4348. Maldives 434000 4520 MV
  4349. Brunei 434000 12128 BN
  4350. To order by the largest values in column "population" and then "GDP",
  4351. we can specify multiple columns like in the next example.
  4352. >>> df.nlargest(3, ['population', 'GDP'])
  4353. population GDP alpha-2
  4354. France 65000000 2583560 FR
  4355. Italy 59000000 1937894 IT
  4356. Brunei 434000 12128 BN
  4357. """
  4358. return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
  4359. def nsmallest(self, n, columns, keep="first") -> "DataFrame":
  4360. """
  4361. Return the first `n` rows ordered by `columns` in ascending order.
  4362. Return the first `n` rows with the smallest values in `columns`, in
  4363. ascending order. The columns that are not specified are returned as
  4364. well, but not used for ordering.
  4365. This method is equivalent to
  4366. ``df.sort_values(columns, ascending=True).head(n)``, but more
  4367. performant.
  4368. Parameters
  4369. ----------
  4370. n : int
  4371. Number of items to retrieve.
  4372. columns : list or str
  4373. Column name or names to order by.
  4374. keep : {'first', 'last', 'all'}, default 'first'
  4375. Where there are duplicate values:
  4376. - ``first`` : take the first occurrence.
  4377. - ``last`` : take the last occurrence.
4378. - ``all`` : do not drop any duplicates, even if it means
  4379. selecting more than `n` items.
  4380. .. versionadded:: 0.24.0
  4381. Returns
  4382. -------
  4383. DataFrame
  4384. See Also
  4385. --------
  4386. DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
  4387. descending order.
  4388. DataFrame.sort_values : Sort DataFrame by the values.
  4389. DataFrame.head : Return the first `n` rows without re-ordering.
  4390. Examples
  4391. --------
  4392. >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
  4393. ... 434000, 434000, 337000, 11300,
  4394. ... 11300, 11300],
  4395. ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
  4396. ... 17036, 182, 38, 311],
  4397. ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
  4398. ... "IS", "NR", "TV", "AI"]},
  4399. ... index=["Italy", "France", "Malta",
  4400. ... "Maldives", "Brunei", "Iceland",
  4401. ... "Nauru", "Tuvalu", "Anguilla"])
  4402. >>> df
  4403. population GDP alpha-2
  4404. Italy 59000000 1937894 IT
  4405. France 65000000 2583560 FR
  4406. Malta 434000 12011 MT
  4407. Maldives 434000 4520 MV
  4408. Brunei 434000 12128 BN
  4409. Iceland 337000 17036 IS
  4410. Nauru 11300 182 NR
  4411. Tuvalu 11300 38 TV
  4412. Anguilla 11300 311 AI
  4413. In the following example, we will use ``nsmallest`` to select the
4414. three rows having the smallest values in column "population".
  4415. >>> df.nsmallest(3, 'population')
  4416. population GDP alpha-2
  4417. Nauru 11300 182 NR
  4418. Tuvalu 11300 38 TV
  4419. Anguilla 11300 311 AI
  4420. When using ``keep='last'``, ties are resolved in reverse order:
  4421. >>> df.nsmallest(3, 'population', keep='last')
  4422. population GDP alpha-2
  4423. Anguilla 11300 311 AI
  4424. Tuvalu 11300 38 TV
  4425. Nauru 11300 182 NR
  4426. When using ``keep='all'``, all duplicate items are maintained:
  4427. >>> df.nsmallest(3, 'population', keep='all')
  4428. population GDP alpha-2
  4429. Nauru 11300 182 NR
  4430. Tuvalu 11300 38 TV
  4431. Anguilla 11300 311 AI
4432. To order by the smallest values in column "population" and then "GDP", we can
  4433. specify multiple columns like in the next example.
  4434. >>> df.nsmallest(3, ['population', 'GDP'])
  4435. population GDP alpha-2
  4436. Tuvalu 11300 38 TV
  4437. Nauru 11300 182 NR
  4438. Anguilla 11300 311 AI
  4439. """
  4440. return algorithms.SelectNFrame(
  4441. self, n=n, keep=keep, columns=columns
  4442. ).nsmallest()
  4443. def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame":
  4444. """
  4445. Swap levels i and j in a MultiIndex on a particular axis.
  4446. Parameters
  4447. ----------
4448. i, j : int or str
4449. Levels of the indices to be swapped. Can pass level name as string.
axis : {0 or 'index', 1 or 'columns'}, default 0
The axis to swap levels on; 0 swaps index levels, 1 swaps column levels.
  4450. Returns
  4451. -------
  4452. DataFrame
  4453. """
  4454. result = self.copy()
  4455. axis = self._get_axis_number(axis)
  4456. if axis == 0:
  4457. result.index = result.index.swaplevel(i, j)
  4458. else:
  4459. result.columns = result.columns.swaplevel(i, j)
  4460. return result
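# The docstring above has no example; a minimal sketch for ``swaplevel``,
# assuming standard pandas behavior (illustrative data):
#
#   >>> import pandas as pd
#   >>> idx = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"])
#   >>> df = pd.DataFrame({"v": [10, 20]}, index=idx)
#   >>> df.swaplevel("x", "y").index.names
#   FrozenList(['y', 'x'])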
  4461. def reorder_levels(self, order, axis=0) -> "DataFrame":
  4462. """
  4463. Rearrange index levels using input order. May not drop or duplicate levels.
  4464. Parameters
  4465. ----------
  4466. order : list of int or list of str
  4467. List representing new level order. Reference level by number
  4468. (position) or by key (label).
4469. axis : {0 or 'index', 1 or 'columns'}, default 0
  4470. Where to reorder levels.
  4471. Returns
  4472. -------
  4473. DataFrame
  4474. """
  4475. axis = self._get_axis_number(axis)
  4476. if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover
  4477. raise TypeError("Can only reorder levels on a hierarchical axis.")
  4478. result = self.copy()
  4479. if axis == 0:
  4480. result.index = result.index.reorder_levels(order)
  4481. else:
  4482. result.columns = result.columns.reorder_levels(order)
  4483. return result
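# A minimal sketch for ``reorder_levels`` (contrast with ``swaplevel``, which
# exchanges exactly two levels); standard pandas behavior assumed:
#
#   >>> import pandas as pd
#   >>> idx = pd.MultiIndex.from_tuples([("a", 1, "u")], names=["x", "y", "z"])
#   >>> df = pd.DataFrame({"v": [10]}, index=idx)
#   >>> df.reorder_levels(["z", "x", "y"]).index.names
#   FrozenList(['z', 'x', 'y'])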
  4484. # ----------------------------------------------------------------------
  4485. # Arithmetic / combination related
  4486. def _combine_frame(self, other, func, fill_value=None, level=None):
  4487. # at this point we have `self._indexed_same(other)`
  4488. if fill_value is None:
  4489. # since _arith_op may be called in a loop, avoid function call
  4490. # overhead if possible by doing this check once
  4491. _arith_op = func
  4492. else:
  4493. def _arith_op(left, right):
  4494. # for the mixed_type case where we iterate over columns,
  4495. # _arith_op(left, right) is equivalent to
  4496. # left._binop(right, func, fill_value=fill_value)
  4497. left, right = ops.fill_binop(left, right, fill_value)
  4498. return func(left, right)
  4499. if ops.should_series_dispatch(self, other, func):
  4500. # iterate over columns
  4501. new_data = ops.dispatch_to_series(self, other, _arith_op)
  4502. else:
  4503. with np.errstate(all="ignore"):
  4504. res_values = _arith_op(self.values, other.values)
  4505. new_data = dispatch_fill_zeros(func, self.values, other.values, res_values)
  4506. return new_data
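# The ``fill_value`` wrapper above is what users observe through the public
# flex-arithmetic API; a sketch of that behavior (note a slot that is NaN on
# *both* sides stays NaN, because ``ops.fill_binop`` only fills one-sided
# holes). Illustrative data, standard pandas semantics assumed:
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> df1 = pd.DataFrame({"A": [1.0, np.nan]})
#   >>> df2 = pd.DataFrame({"A": [10.0, np.nan]})
#   >>> df1.add(df2, fill_value=0)
#         A
#   0  11.0
#   1   NaN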
  4507. def _combine_match_index(self, other, func):
  4508. # at this point we have `self.index.equals(other.index)`
  4509. if ops.should_series_dispatch(self, other, func):
  4510. # operate column-wise; avoid costly object-casting in `.values`
  4511. new_data = ops.dispatch_to_series(self, other, func)
  4512. else:
  4513. # fastpath --> operate directly on values
  4514. with np.errstate(all="ignore"):
  4515. new_data = func(self.values.T, other.values).T
  4516. return new_data
  4517. def _construct_result(self, result) -> "DataFrame":
  4518. """
  4519. Wrap the result of an arithmetic, comparison, or logical operation.
  4520. Parameters
  4521. ----------
  4522. result : DataFrame
  4523. Returns
  4524. -------
  4525. DataFrame
  4526. """
  4527. out = self._constructor(result, index=self.index, copy=False)
  4528. # Pin columns instead of passing to constructor for compat with
  4529. # non-unique columns case
  4530. out.columns = self.columns
  4531. return out
  4532. def combine(
  4533. self, other: "DataFrame", func, fill_value=None, overwrite=True
  4534. ) -> "DataFrame":
  4535. """
  4536. Perform column-wise combine with another DataFrame.
  4537. Combines a DataFrame with `other` DataFrame using `func`
  4538. to element-wise combine columns. The row and column indexes of the
  4539. resulting DataFrame will be the union of the two.
  4540. Parameters
  4541. ----------
  4542. other : DataFrame
  4543. The DataFrame to merge column-wise.
  4544. func : function
4545. Function that takes two Series as inputs and returns a Series or a
4546. scalar. Used to merge the two dataframes column by column.
  4547. fill_value : scalar value, default None
  4548. The value to fill NaNs with prior to passing any column to the
  4549. merge func.
  4550. overwrite : bool, default True
  4551. If True, columns in `self` that do not exist in `other` will be
  4552. overwritten with NaNs.
  4553. Returns
  4554. -------
  4555. DataFrame
  4556. Combination of the provided DataFrames.
  4557. See Also
  4558. --------
  4559. DataFrame.combine_first : Combine two DataFrame objects and default to
  4560. non-null values in frame calling the method.
  4561. Examples
  4562. --------
  4563. Combine using a simple function that chooses the smaller column.
  4564. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  4565. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4566. >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
  4567. >>> df1.combine(df2, take_smaller)
  4568. A B
  4569. 0 0 3
  4570. 1 0 3
  4571. Example using a true element-wise combine function.
  4572. >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
  4573. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4574. >>> df1.combine(df2, np.minimum)
  4575. A B
  4576. 0 1 2
  4577. 1 0 3
  4578. Using `fill_value` fills Nones prior to passing the column to the
  4579. merge function.
  4580. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  4581. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4582. >>> df1.combine(df2, take_smaller, fill_value=-5)
  4583. A B
  4584. 0 0 -5.0
  4585. 1 0 4.0
  4586. However, if the same element in both dataframes is None, that None
4587. is preserved.
  4588. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  4589. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
  4590. >>> df1.combine(df2, take_smaller, fill_value=-5)
  4591. A B
  4592. 0 0 -5.0
  4593. 1 0 3.0
  4594. Example that demonstrates the use of `overwrite` and behavior when
  4595. the axis differ between the dataframes.
  4596. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  4597. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
  4598. >>> df1.combine(df2, take_smaller)
  4599. A B C
  4600. 0 NaN NaN NaN
  4601. 1 NaN 3.0 -10.0
  4602. 2 NaN 3.0 1.0
  4603. >>> df1.combine(df2, take_smaller, overwrite=False)
  4604. A B C
  4605. 0 0.0 NaN NaN
  4606. 1 0.0 3.0 -10.0
  4607. 2 NaN 3.0 1.0
4608. Demonstrating the preference of the passed-in dataframe.
  4609. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
  4610. >>> df2.combine(df1, take_smaller)
  4611. A B C
  4612. 0 0.0 NaN NaN
  4613. 1 0.0 3.0 NaN
  4614. 2 NaN 3.0 NaN
  4615. >>> df2.combine(df1, take_smaller, overwrite=False)
  4616. A B C
  4617. 0 0.0 NaN NaN
  4618. 1 0.0 3.0 1.0
  4619. 2 NaN 3.0 1.0
  4620. """
  4621. other_idxlen = len(other.index) # save for compare
  4622. this, other = self.align(other, copy=False)
  4623. new_index = this.index
  4624. if other.empty and len(new_index) == len(self.index):
  4625. return self.copy()
  4626. if self.empty and len(other) == other_idxlen:
  4627. return other.copy()
  4628. # sorts if possible
  4629. new_columns = this.columns.union(other.columns)
  4630. do_fill = fill_value is not None
  4631. result = {}
  4632. for col in new_columns:
  4633. series = this[col]
  4634. otherSeries = other[col]
  4635. this_dtype = series.dtype
  4636. other_dtype = otherSeries.dtype
  4637. this_mask = isna(series)
  4638. other_mask = isna(otherSeries)
  4639. # don't overwrite columns unnecessarily
  4640. # DO propagate if this column is not in the intersection
  4641. if not overwrite and other_mask.all():
  4642. result[col] = this[col].copy()
  4643. continue
  4644. if do_fill:
  4645. series = series.copy()
  4646. otherSeries = otherSeries.copy()
  4647. series[this_mask] = fill_value
  4648. otherSeries[other_mask] = fill_value
  4649. if col not in self.columns:
  4650. # If self DataFrame does not have col in other DataFrame,
  4651. # try to promote series, which is all NaN, as other_dtype.
  4652. new_dtype = other_dtype
  4653. try:
  4654. series = series.astype(new_dtype, copy=False)
  4655. except ValueError:
  4656. # e.g. new_dtype is integer types
  4657. pass
  4658. else:
  4659. # if we have different dtypes, possibly promote
  4660. new_dtype = find_common_type([this_dtype, other_dtype])
  4661. if not is_dtype_equal(this_dtype, new_dtype):
  4662. series = series.astype(new_dtype)
  4663. if not is_dtype_equal(other_dtype, new_dtype):
  4664. otherSeries = otherSeries.astype(new_dtype)
  4665. arr = func(series, otherSeries)
  4666. arr = maybe_downcast_to_dtype(arr, this_dtype)
  4667. result[col] = arr
  4668. # convert_objects just in case
  4669. return self._constructor(result, index=new_index, columns=new_columns)
  4670. def combine_first(self, other: "DataFrame") -> "DataFrame":
  4671. """
  4672. Update null elements with value in the same location in `other`.
  4673. Combine two DataFrame objects by filling null values in one DataFrame
  4674. with non-null values from other DataFrame. The row and column indexes
  4675. of the resulting DataFrame will be the union of the two.
  4676. Parameters
  4677. ----------
  4678. other : DataFrame
  4679. Provided DataFrame to use to fill null values.
  4680. Returns
  4681. -------
  4682. DataFrame
  4683. See Also
  4684. --------
  4685. DataFrame.combine : Perform series-wise operation on two DataFrames
  4686. using a given function.
  4687. Examples
  4688. --------
  4689. >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
  4690. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  4691. >>> df1.combine_first(df2)
  4692. A B
  4693. 0 1.0 3.0
  4694. 1 0.0 4.0
  4695. Null values still persist if the location of that null value
4696. does not exist in `other`.
  4697. >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
  4698. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
  4699. >>> df1.combine_first(df2)
  4700. A B C
  4701. 0 NaN 4.0 NaN
  4702. 1 0.0 3.0 1.0
  4703. 2 NaN 3.0 1.0
  4704. """
  4705. import pandas.core.computation.expressions as expressions
  4706. def extract_values(arr):
  4707. # Does two things:
  4708. # 1. maybe gets the values from the Series / Index
  4709. # 2. convert datelike to i8
  4710. if isinstance(arr, (ABCIndexClass, ABCSeries)):
  4711. arr = arr._values
  4712. if needs_i8_conversion(arr):
  4713. if is_extension_array_dtype(arr.dtype):
  4714. arr = arr.asi8
  4715. else:
  4716. arr = arr.view("i8")
  4717. return arr
  4718. def combiner(x, y):
  4719. mask = isna(x)
  4720. if isinstance(mask, (ABCIndexClass, ABCSeries)):
  4721. mask = mask._values
  4722. x_values = extract_values(x)
  4723. y_values = extract_values(y)
  4724. # If the column y in other DataFrame is not in first DataFrame,
  4725. # just return y_values.
  4726. if y.name not in self.columns:
  4727. return y_values
  4728. return expressions.where(mask, y_values, x_values)
  4729. return self.combine(other, combiner, overwrite=False)
  4730. def update(
  4731. self, other, join="left", overwrite=True, filter_func=None, errors="ignore"
  4732. ) -> None:
  4733. """
  4734. Modify in place using non-NA values from another DataFrame.
  4735. Aligns on indices. There is no return value.
  4736. Parameters
  4737. ----------
  4738. other : DataFrame, or object coercible into a DataFrame
  4739. Should have at least one matching index/column label
  4740. with the original DataFrame. If a Series is passed,
  4741. its name attribute must be set, and that will be
  4742. used as the column name to align with the original DataFrame.
  4743. join : {'left'}, default 'left'
  4744. Only left join is implemented, keeping the index and columns of the
  4745. original object.
  4746. overwrite : bool, default True
  4747. How to handle non-NA values for overlapping keys:
  4748. * True: overwrite original DataFrame's values
  4749. with values from `other`.
  4750. * False: only update values that are NA in
  4751. the original DataFrame.
  4752. filter_func : callable(1d-array) -> bool 1d-array, optional
  4753. Can choose to replace values other than NA. Return True for values
  4754. that should be updated.
  4755. errors : {'raise', 'ignore'}, default 'ignore'
  4756. If 'raise', will raise a ValueError if the DataFrame and `other`
  4757. both contain non-NA data in the same place.
  4758. .. versionchanged:: 0.24.0
  4759. Changed from `raise_conflict=False|True`
  4760. to `errors='ignore'|'raise'`.
  4761. Returns
  4762. -------
  4763. None : method directly changes calling object
  4764. Raises
  4765. ------
  4766. ValueError
  4767. * When `errors='raise'` and there's overlapping non-NA data.
  4768. * When `errors` is not either `'ignore'` or `'raise'`
  4769. NotImplementedError
  4770. * If `join != 'left'`
  4771. See Also
  4772. --------
  4773. dict.update : Similar method for dictionaries.
  4774. DataFrame.merge : For column(s)-on-columns(s) operations.
  4775. Examples
  4776. --------
  4777. >>> df = pd.DataFrame({'A': [1, 2, 3],
  4778. ... 'B': [400, 500, 600]})
  4779. >>> new_df = pd.DataFrame({'B': [4, 5, 6],
  4780. ... 'C': [7, 8, 9]})
  4781. >>> df.update(new_df)
  4782. >>> df
  4783. A B
  4784. 0 1 4
  4785. 1 2 5
  4786. 2 3 6
  4787. The DataFrame's length does not increase as a result of the update,
  4788. only values at matching index/column labels are updated.
  4789. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  4790. ... 'B': ['x', 'y', 'z']})
  4791. >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
  4792. >>> df.update(new_df)
  4793. >>> df
  4794. A B
  4795. 0 a d
  4796. 1 b e
  4797. 2 c f
4798. For Series, its name attribute must be set.
  4799. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  4800. ... 'B': ['x', 'y', 'z']})
  4801. >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
  4802. >>> df.update(new_column)
  4803. >>> df
  4804. A B
  4805. 0 a d
  4806. 1 b y
  4807. 2 c e
  4808. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  4809. ... 'B': ['x', 'y', 'z']})
  4810. >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
  4811. >>> df.update(new_df)
  4812. >>> df
  4813. A B
  4814. 0 a x
  4815. 1 b d
  4816. 2 c e
  4817. If `other` contains NaNs the corresponding values are not updated
  4818. in the original dataframe.
  4819. >>> df = pd.DataFrame({'A': [1, 2, 3],
  4820. ... 'B': [400, 500, 600]})
  4821. >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
  4822. >>> df.update(new_df)
  4823. >>> df
  4824. A B
  4825. 0 1 4.0
  4826. 1 2 500.0
  4827. 2 3 6.0
  4828. """
  4829. import pandas.core.computation.expressions as expressions
  4830. # TODO: Support other joins
  4831. if join != "left": # pragma: no cover
  4832. raise NotImplementedError("Only left join is supported")
  4833. if errors not in ["ignore", "raise"]:
  4834. raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
  4835. if not isinstance(other, DataFrame):
  4836. other = DataFrame(other)
  4837. other = other.reindex_like(self)
  4838. for col in self.columns:
  4839. this = self[col]._values
  4840. that = other[col]._values
  4841. if filter_func is not None:
  4842. with np.errstate(all="ignore"):
  4843. mask = ~filter_func(this) | isna(that)
  4844. else:
  4845. if errors == "raise":
  4846. mask_this = notna(that)
  4847. mask_that = notna(this)
  4848. if any(mask_this & mask_that):
  4849. raise ValueError("Data overlaps.")
  4850. if overwrite:
  4851. mask = isna(that)
  4852. else:
  4853. mask = notna(this)
  4854. # don't overwrite columns unnecessarily
  4855. if mask.all():
  4856. continue
  4857. self[col] = expressions.where(mask, this, that)
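# ``filter_func`` has no example in the docstring above; a short sketch of
# the mask logic ``~filter_func(this) | isna(that)`` (values for which
# ``filter_func`` returns True are the ones eligible to be replaced).
# Illustrative data, standard pandas semantics assumed:
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": [1, 2, 3]})
#   >>> new = pd.DataFrame({"A": [10, 20, 30]})
#   >>> df.update(new, filter_func=lambda v: v > 1)
#   >>> df
#       A
#   0   1
#   1  20
#   2  30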
  4858. # ----------------------------------------------------------------------
  4859. # Data reshaping
  4860. @Appender(
  4861. """
  4862. Examples
  4863. --------
  4864. >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
  4865. ... 'Parrot', 'Parrot'],
  4866. ... 'Max Speed': [380., 370., 24., 26.]})
  4867. >>> df
  4868. Animal Max Speed
  4869. 0 Falcon 380.0
  4870. 1 Falcon 370.0
  4871. 2 Parrot 24.0
  4872. 3 Parrot 26.0
  4873. >>> df.groupby(['Animal']).mean()
  4874. Max Speed
  4875. Animal
  4876. Falcon 375.0
  4877. Parrot 25.0
  4878. **Hierarchical Indexes**
  4879. We can groupby different levels of a hierarchical index
  4880. using the `level` parameter:
  4881. >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
  4882. ... ['Captive', 'Wild', 'Captive', 'Wild']]
  4883. >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
  4884. >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
  4885. ... index=index)
  4886. >>> df
  4887. Max Speed
  4888. Animal Type
  4889. Falcon Captive 390.0
  4890. Wild 350.0
  4891. Parrot Captive 30.0
  4892. Wild 20.0
  4893. >>> df.groupby(level=0).mean()
  4894. Max Speed
  4895. Animal
  4896. Falcon 370.0
  4897. Parrot 25.0
  4898. >>> df.groupby(level="Type").mean()
  4899. Max Speed
  4900. Type
  4901. Captive 210.0
  4902. Wild 185.0
  4903. """
  4904. )
  4905. @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
  4906. def groupby(
  4907. self,
  4908. by=None,
  4909. axis=0,
  4910. level=None,
  4911. as_index: bool = True,
  4912. sort: bool = True,
  4913. group_keys: bool = True,
  4914. squeeze: bool = False,
  4915. observed: bool = False,
  4916. ) -> "groupby_generic.DataFrameGroupBy":
  4917. if level is None and by is None:
  4918. raise TypeError("You have to supply one of 'by' and 'level'")
  4919. axis = self._get_axis_number(axis)
  4920. return groupby_generic.DataFrameGroupBy(
  4921. obj=self,
  4922. keys=by,
  4923. axis=axis,
  4924. level=level,
  4925. as_index=as_index,
  4926. sort=sort,
  4927. group_keys=group_keys,
  4928. squeeze=squeeze,
  4929. observed=observed,
  4930. )
  4931. _shared_docs[
  4932. "pivot"
  4933. ] = """
  4934. Return reshaped DataFrame organized by given index / column values.
  4935. Reshape data (produce a "pivot" table) based on column values. Uses
  4936. unique values from specified `index` / `columns` to form axes of the
  4937. resulting DataFrame. This function does not support data
4938. aggregation; multiple values will result in a MultiIndex in the
  4939. columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
  4940. Parameters
  4941. ----------%s
  4942. index : str or object, optional
  4943. Column to use to make new frame's index. If None, uses
  4944. existing index.
  4945. columns : str or object
  4946. Column to use to make new frame's columns.
  4947. values : str, object or a list of the previous, optional
  4948. Column(s) to use for populating new frame's values. If not
  4949. specified, all remaining columns will be used and the result will
  4950. have hierarchically indexed columns.
  4951. .. versionchanged:: 0.23.0
  4952. Also accept list of column names.
  4953. Returns
  4954. -------
  4955. DataFrame
  4956. Returns reshaped DataFrame.
  4957. Raises
  4958. ------
  4959. ValueError:
  4960. When there are any `index`, `columns` combinations with multiple
4961. values. Use `DataFrame.pivot_table` when you need to aggregate.
  4962. See Also
  4963. --------
  4964. DataFrame.pivot_table : Generalization of pivot that can handle
  4965. duplicate values for one index/column pair.
  4966. DataFrame.unstack : Pivot based on the index values instead of a
  4967. column.
  4968. Notes
  4969. -----
  4970. For finer-tuned control, see hierarchical indexing documentation along
  4971. with the related stack/unstack methods.
  4972. Examples
  4973. --------
  4974. >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
  4975. ... 'two'],
  4976. ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
  4977. ... 'baz': [1, 2, 3, 4, 5, 6],
  4978. ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
  4979. >>> df
  4980. foo bar baz zoo
  4981. 0 one A 1 x
  4982. 1 one B 2 y
  4983. 2 one C 3 z
  4984. 3 two A 4 q
  4985. 4 two B 5 w
  4986. 5 two C 6 t
  4987. >>> df.pivot(index='foo', columns='bar', values='baz')
  4988. bar A B C
  4989. foo
  4990. one 1 2 3
  4991. two 4 5 6
  4992. >>> df.pivot(index='foo', columns='bar')['baz']
  4993. bar A B C
  4994. foo
  4995. one 1 2 3
  4996. two 4 5 6
  4997. >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
  4998. baz zoo
  4999. bar A B C A B C
  5000. foo
  5001. one 1 2 3 x y z
  5002. two 4 5 6 q w t
  5003. A ValueError is raised if there are any duplicates.
  5004. >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
  5005. ... "bar": ['A', 'A', 'B', 'C'],
  5006. ... "baz": [1, 2, 3, 4]})
  5007. >>> df
  5008. foo bar baz
  5009. 0 one A 1
  5010. 1 one A 2
  5011. 2 two B 3
  5012. 3 two C 4
  5013. Notice that the first two rows are the same for our `index`
  5014. and `columns` arguments.
  5015. >>> df.pivot(index='foo', columns='bar', values='baz')
  5016. Traceback (most recent call last):
  5017. ...
  5018. ValueError: Index contains duplicate entries, cannot reshape
  5019. """
  5020. @Substitution("")
  5021. @Appender(_shared_docs["pivot"])
  5022. def pivot(self, index=None, columns=None, values=None) -> "DataFrame":
  5023. from pandas.core.reshape.pivot import pivot
  5024. return pivot(self, index=index, columns=columns, values=values)
  5025. _shared_docs[
  5026. "pivot_table"
  5027. ] = """
  5028. Create a spreadsheet-style pivot table as a DataFrame.
  5029. The levels in the pivot table will be stored in MultiIndex objects
  5030. (hierarchical indexes) on the index and columns of the result DataFrame.
  5031. Parameters
  5032. ----------%s
  5033. values : column to aggregate, optional
  5034. index : column, Grouper, array, or list of the previous
  5035. If an array is passed, it must be the same length as the data. The
  5036. list can contain any of the other types (except list).
  5037. Keys to group by on the pivot table index. If an array is passed,
5038. it is used in the same manner as column values.
  5039. columns : column, Grouper, array, or list of the previous
  5040. If an array is passed, it must be the same length as the data. The
  5041. list can contain any of the other types (except list).
  5042. Keys to group by on the pivot table column. If an array is passed,
5043. it is used in the same manner as column values.
  5044. aggfunc : function, list of functions, dict, default numpy.mean
  5045. If list of functions passed, the resulting pivot table will have
  5046. hierarchical columns whose top level are the function names
  5047. (inferred from the function objects themselves)
  5048. If dict is passed, the key is column to aggregate and value
  5049. is function or list of functions.
  5050. fill_value : scalar, default None
  5051. Value to replace missing values with.
  5052. margins : bool, default False
  5053. Add all row / columns (e.g. for subtotal / grand totals).
  5054. dropna : bool, default True
  5055. Do not include columns whose entries are all NaN.
  5056. margins_name : str, default 'All'
  5057. Name of the row / column that will contain the totals
  5058. when margins is True.
  5059. observed : bool, default False
  5060. This only applies if any of the groupers are Categoricals.
  5061. If True: only show observed values for categorical groupers.
  5062. If False: show all values for categorical groupers.
  5063. .. versionchanged:: 0.25.0
  5064. Returns
  5065. -------
  5066. DataFrame
  5067. An Excel style pivot table.
  5068. See Also
  5069. --------
  5070. DataFrame.pivot : Pivot without aggregation that can handle
  5071. non-numeric data.
  5072. Examples
  5073. --------
  5074. >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
  5075. ... "bar", "bar", "bar", "bar"],
  5076. ... "B": ["one", "one", "one", "two", "two",
  5077. ... "one", "one", "two", "two"],
  5078. ... "C": ["small", "large", "large", "small",
  5079. ... "small", "large", "small", "small",
  5080. ... "large"],
  5081. ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
  5082. ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
  5083. >>> df
  5084. A B C D E
  5085. 0 foo one small 1 2
  5086. 1 foo one large 2 4
  5087. 2 foo one large 2 5
  5088. 3 foo two small 3 5
  5089. 4 foo two small 3 6
  5090. 5 bar one large 4 6
  5091. 6 bar one small 5 8
  5092. 7 bar two small 6 9
  5093. 8 bar two large 7 9
  5094. This first example aggregates values by taking the sum.
  5095. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
  5096. ... columns=['C'], aggfunc=np.sum)
  5097. >>> table
  5098. C large small
  5099. A B
  5100. bar one 4.0 5.0
  5101. two 7.0 6.0
  5102. foo one 4.0 1.0
  5103. two NaN 6.0
  5104. We can also fill missing values using the `fill_value` parameter.
  5105. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
  5106. ... columns=['C'], aggfunc=np.sum, fill_value=0)
  5107. >>> table
  5108. C large small
  5109. A B
  5110. bar one 4 5
  5111. two 7 6
  5112. foo one 4 1
  5113. two 0 6
  5114. The next example aggregates by taking the mean across multiple columns.
  5115. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  5116. ... aggfunc={'D': np.mean,
  5117. ... 'E': np.mean})
  5118. >>> table
  5119. D E
  5120. A C
  5121. bar large 5.500000 7.500000
  5122. small 5.500000 8.500000
  5123. foo large 2.000000 4.500000
  5124. small 2.333333 4.333333
  5125. We can also calculate multiple types of aggregations for any given
  5126. value column.
  5127. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  5128. ... aggfunc={'D': np.mean,
  5129. ... 'E': [min, max, np.mean]})
  5130. >>> table
  5131. D E
  5132. mean max mean min
  5133. A C
  5134. bar large 5.500000 9.0 7.500000 6.0
  5135. small 5.500000 9.0 8.500000 8.0
  5136. foo large 2.000000 5.0 4.500000 4.0
  5137. small 2.333333 6.0 4.333333 2.0
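
    A sketch of ``margins`` (added for illustration; the totals below were
    computed by hand from the frame above):

    >>> pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'],
    ...                aggfunc=np.sum, fill_value=0, margins=True)
    C        large  small  All
    A   B
    bar one      4      5    9
        two      7      6   13
    foo one      4      1    5
        two      0      6    6
    All         15     18   33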
  5138. """
    @Substitution("")
    @Appender(_shared_docs["pivot_table"])
    def pivot_table(
        self,
        values=None,
        index=None,
        columns=None,
        aggfunc="mean",
        fill_value=None,
        margins=False,
        dropna=True,
        margins_name="All",
        observed=False,
    ) -> "DataFrame":
        from pandas.core.reshape.pivot import pivot_table

        return pivot_table(
            self,
            values=values,
            index=index,
            columns=columns,
            aggfunc=aggfunc,
            fill_value=fill_value,
            margins=margins,
            dropna=dropna,
            margins_name=margins_name,
            observed=observed,
        )
    def stack(self, level=-1, dropna=True):
        """
        Stack the prescribed level(s) from columns to index.

        Return a reshaped DataFrame or Series having a multi-level
        index with one or more new inner-most levels compared to the current
        DataFrame. The new inner-most levels are created by pivoting the
        columns of the current dataframe:

          - if the columns have a single level, the output is a Series;
          - if the columns have multiple levels, the new index
            level(s) is (are) taken from the prescribed level(s) and
            the output is a DataFrame.

        The new index levels are sorted.

        Parameters
        ----------
        level : int, str, list, default -1
            Level(s) to stack from the column axis onto the index
            axis, defined as one index or label, or a list of indices
            or labels.
        dropna : bool, default True
            Whether to drop rows in the resulting Frame/Series with
            missing values. Stacking a column level onto the index
            axis can create combinations of index and column values
            that are missing from the original dataframe. See Examples
            section.

        Returns
        -------
        DataFrame or Series
            Stacked dataframe or series.

        See Also
        --------
        DataFrame.unstack : Unstack prescribed level(s) from index axis
            onto column axis.
        DataFrame.pivot : Reshape dataframe from long format to wide
            format.
        DataFrame.pivot_table : Create a spreadsheet-style pivot table
            as a DataFrame.

        Notes
        -----
        The function is named by analogy with a collection of books
        being reorganized from being side by side on a horizontal
        position (the columns of the dataframe) to being stacked
        vertically on top of each other (in the index of the
        dataframe).

        Examples
        --------
        **Single level columns**

        >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=['weight', 'height'])

        Stacking a dataframe with a single level column axis returns a Series:

        >>> df_single_level_cols
             weight  height
        cat       0       1
        dog       2       3
        >>> df_single_level_cols.stack()
        cat  weight    0
             height    1
        dog  weight    2
             height    3
        dtype: int64

        **Multi level columns: simple case**

        >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
        ...                                        ('weight', 'pounds')])
        >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=multicol1)

        Stacking a dataframe with a multi-level column axis:

        >>> df_multi_level_cols1
             weight
                 kg    pounds
        cat       1         2
        dog       2         4
        >>> df_multi_level_cols1.stack()
                    weight
        cat kg           1
            pounds       2
        dog kg           2
            pounds       4

        **Missing values**

        >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
        ...                                        ('height', 'm')])
        >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=multicol2)

        It is common to have missing values when stacking a dataframe
        with multi-level columns, as the stacked dataframe typically
        has more values than the original dataframe. Missing values
        are filled with NaNs:

        >>> df_multi_level_cols2
            weight height
                kg      m
        cat    1.0    2.0
        dog    3.0    4.0
        >>> df_multi_level_cols2.stack()
                height  weight
        cat kg     NaN     1.0
            m      2.0     NaN
        dog kg     NaN     3.0
            m      4.0     NaN

        **Prescribing the level(s) to be stacked**

        The first parameter controls which level or levels are stacked:

        >>> df_multi_level_cols2.stack(0)
                     kg    m
        cat height  NaN  2.0
            weight  1.0  NaN
        dog height  NaN  4.0
            weight  3.0  NaN
        >>> df_multi_level_cols2.stack([0, 1])
        cat  height  m     2.0
             weight  kg    1.0
        dog  height  m     4.0
             weight  kg    3.0
        dtype: float64

        **Dropping missing values**

        >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
        ...                                     index=['cat', 'dog'],
        ...                                     columns=multicol2)

        Note that rows where all values are missing are dropped by
        default but this behaviour can be controlled via the dropna
        keyword parameter:

        >>> df_multi_level_cols3
            weight height
                kg      m
        cat    NaN    1.0
        dog    2.0    3.0
        >>> df_multi_level_cols3.stack(dropna=False)
                height  weight
        cat kg     NaN     NaN
            m      1.0     NaN
        dog kg     NaN     2.0
            m      3.0     NaN
        >>> df_multi_level_cols3.stack(dropna=True)
                height  weight
        cat m      1.0     NaN
        dog kg     NaN     2.0
            m      3.0     NaN
        """
        from pandas.core.reshape.reshape import stack, stack_multiple

        if isinstance(level, (tuple, list)):
            return stack_multiple(self, level, dropna=dropna)
        else:
            return stack(self, level, dropna=dropna)
    def explode(self, column: Union[str, Tuple]) -> "DataFrame":
        """
        Transform each element of a list-like to a row, replicating index values.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        column : str or tuple
            Column to explode.

        Returns
        -------
        DataFrame
            Exploded lists to rows of the subset columns;
            index will be duplicated for these rows.

        Raises
        ------
        ValueError :
            if columns of the frame are not unique.

        See Also
        --------
        DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
            index labels.
        DataFrame.melt : Unpivot a DataFrame from wide format to long format.
        Series.explode : Explode a DataFrame from list-like columns to long format.

        Notes
        -----
        This routine will explode list-likes including lists, tuples,
        Series, and np.ndarray. The result dtype of the subset rows will
        be object. Scalars will be returned unchanged. Empty list-likes will
        result in a np.nan for that row.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
        >>> df
                   A  B
        0  [1, 2, 3]  1
        1        foo  1
        2         []  1
        3     [3, 4]  1

        >>> df.explode('A')
             A  B
        0    1  1
        0    2  1
        0    3  1
        1  foo  1
        2  NaN  1
        3    3  1
        3    4  1
        """
        if not (is_scalar(column) or isinstance(column, tuple)):
            raise ValueError("column must be a scalar")
        if not self.columns.is_unique:
            raise ValueError("columns must be unique")

        df = self.reset_index(drop=True)
        # TODO: use overload to refine return type of reset_index
        assert df is not None  # needed for mypy
        result = df[column].explode()
        result = df.drop([column], axis=1).join(result)
        result.index = self.index.take(result.index)
        result = result.reindex(columns=self.columns, copy=False)

        return result
    def unstack(self, level=-1, fill_value=None):
        """
        Pivot a level of the (necessarily hierarchical) index labels.

        Returns a DataFrame having a new level of column labels whose inner-most level
        consists of the pivoted index labels.

        If the index is not a MultiIndex, the output will be a Series
        (the analogue of stack when the columns are not a MultiIndex).

        The level involved will automatically get sorted.

        Parameters
        ----------
        level : int, str, or list of these, default -1 (last level)
            Level(s) of index to unstack, can pass level name.
        fill_value : int, str or dict
            Replace NaN with this value if the unstack produces missing
            values; see the sketch at the end of the Examples section.

        Returns
        -------
        Series or DataFrame

        See Also
        --------
        DataFrame.pivot : Pivot a table based on column values.
        DataFrame.stack : Pivot a level of the column labels (inverse operation
            from `unstack`).

        Examples
        --------
        >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
        ...                                    ('two', 'a'), ('two', 'b')])
        >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
        >>> s
        one  a   1.0
             b   2.0
        two  a   3.0
             b   4.0
        dtype: float64

        >>> s.unstack(level=-1)
               a    b
        one  1.0  2.0
        two  3.0  4.0

        >>> s.unstack(level=0)
           one  two
        a  1.0  3.0
        b  2.0  4.0

        >>> df = s.unstack(level=0)
        >>> df.unstack()
        one  a  1.0
             b  2.0
        two  a  3.0
             b  4.0
        dtype: float64
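
        A sketch of ``fill_value`` (added for illustration): dropping one
        combination and unstacking fills the hole instead of leaving NaN.

        >>> s.drop(('two', 'b')).unstack(fill_value=0)
               a    b
        one  1.0  2.0
        two  3.0  0.0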
  5416. """
  5417. from pandas.core.reshape.reshape import unstack
  5418. return unstack(self, level, fill_value)
    _shared_docs[
        "melt"
    ] = """
    Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.
    %(versionadded)s
    Parameters
    ----------
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or str, optional
        If columns are a MultiIndex then use this level to melt.

    Returns
    -------
    DataFrame
        Unpivoted DataFrame.

    See Also
    --------
    %(other)s
    pivot_table
    DataFrame.pivot
    Series.explode

    Examples
    --------
    >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
    ...                    'B': {0: 1, 1: 3, 2: 5},
    ...                    'C': {0: 2, 1: 4, 2: 6}})
    >>> df
       A  B  C
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> %(caller)sid_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5
    3  a        C      2
    4  b        C      4
    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> %(caller)sid_vars=['A'], value_vars=['B'],
    ...         var_name='myVarname', value_name='myValname')
       A myVarname  myValname
    0  a         B          1
    1  b         B          3
    2  c         B          5

    If you have multi-index columns:

    >>> df.columns = [list('ABC'), list('DEF')]
    >>> df
       A  B  C
       D  E  F
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
      (A, D) variable_0 variable_1  value
    0      a          B          E      1
    1      b          B          E      3
    2      c          B          E      5
    """
    @Appender(
        _shared_docs["melt"]
        % dict(
            caller="df.melt(", versionadded=".. versionadded:: 0.20.0\n", other="melt"
        )
    )
    def melt(
        self,
        id_vars=None,
        value_vars=None,
        var_name=None,
        value_name="value",
        col_level=None,
    ) -> "DataFrame":
        from pandas.core.reshape.melt import melt

        return melt(
            self,
            id_vars=id_vars,
            value_vars=value_vars,
            var_name=var_name,
            value_name=value_name,
            col_level=col_level,
        )
    # ----------------------------------------------------------------------
    # Time series-related

    def diff(self, periods=1, axis=0) -> "DataFrame":
        """
        First discrete difference of element.

        Calculates the difference of a DataFrame element compared with another
        element in the DataFrame (default is the element in the same column
        of the previous row).

        Parameters
        ----------
        periods : int, default 1
            Periods to shift for calculating difference, accepts negative
            values.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Take difference over rows (0) or columns (1).

        Returns
        -------
        DataFrame

        See Also
        --------
        Series.diff : First discrete difference for a Series.
        DataFrame.pct_change : Percent change over given number of periods.
        DataFrame.shift : Shift index by desired number of periods with an
            optional time freq.

        Notes
        -----
        For boolean dtypes, this uses :meth:`operator.xor` rather than
        :meth:`operator.sub` (see the boolean sketch at the end of the
        Examples section).

        Examples
        --------

        Difference with previous row

        >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
        ...                    'b': [1, 1, 2, 3, 5, 8],
        ...                    'c': [1, 4, 9, 16, 25, 36]})
        >>> df
           a  b   c
        0  1  1   1
        1  2  1   4
        2  3  2   9
        3  4  3  16
        4  5  5  25
        5  6  8  36

        >>> df.diff()
             a    b     c
        0  NaN  NaN   NaN
        1  1.0  0.0   3.0
        2  1.0  1.0   5.0
        3  1.0  1.0   7.0
        4  1.0  2.0   9.0
        5  1.0  3.0  11.0

        Difference with previous column

        >>> df.diff(axis=1)
            a    b     c
        0 NaN  0.0   0.0
        1 NaN -1.0   3.0
        2 NaN -1.0   7.0
        3 NaN -1.0  13.0
        4 NaN  0.0  20.0
        5 NaN  2.0  28.0

        Difference with 3rd previous row

        >>> df.diff(periods=3)
             a    b     c
        0  NaN  NaN   NaN
        1  NaN  NaN   NaN
        2  NaN  NaN   NaN
        3  3.0  2.0  15.0
        4  3.0  4.0  21.0
        5  3.0  6.0  27.0

        Difference with following row

        >>> df.diff(periods=-1)
             a    b     c
        0 -1.0  0.0  -3.0
        1 -1.0 -1.0  -5.0
        2 -1.0 -1.0  -7.0
        3 -1.0 -2.0  -9.0
        4 -1.0 -3.0 -11.0
        5  NaN  NaN   NaN
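
        A sketch of the boolean behaviour (added for illustration; each row
        below is the xor of a row with the previous one):

        >>> pd.DataFrame({'a': [True, True, False, False]}).diff()
               a
        0    NaN
        1  False
        2   True
        3  False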
  5602. """
  5603. bm_axis = self._get_block_manager_axis(axis)
  5604. new_data = self._data.diff(n=periods, axis=bm_axis)
  5605. return self._constructor(new_data)
    # ----------------------------------------------------------------------
    # Function application

    def _gotitem(
        self,
        key: Union[str, List[str]],
        ndim: int,
        subset: Optional[Union[Series, ABCDataFrame]] = None,
    ) -> Union[Series, ABCDataFrame]:
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : string / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        if subset is None:
            subset = self
        elif subset.ndim == 1:  # is Series
            return subset

        # TODO: _shallow_copy(subset)?
        return subset[key]
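    # Illustrative note (not in the upstream source): with the default
    # ``subset=None`` this reduces to plain column selection, e.g.
    # ``df._gotitem("a", ndim=1)`` returns ``df["a"]``.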
    _agg_summary_and_see_also_doc = dedent(
        """
    The aggregation operations are always performed over an axis, either the
    index (default) or the column axis. This behavior is different from
    `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
    `var`), where the default is to compute the aggregation of the flattened
    array, e.g., ``numpy.mean(arr_2d)`` as opposed to
    ``numpy.mean(arr_2d, axis=0)``.

    `agg` is an alias for `aggregate`. Use the alias.

    See Also
    --------
    DataFrame.apply : Perform any type of operations.
    DataFrame.transform : Perform transformation type operations.
    core.groupby.GroupBy : Perform operations over groups.
    core.resample.Resampler : Perform operations over resampled bins.
    core.window.Rolling : Perform operations over rolling window.
    core.window.Expanding : Perform operations over expanding window.
    core.window.EWM : Perform operation over exponential weighted
        window.
    """
    )

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame([[1, 2, 3],
    ...                    [4, 5, 6],
    ...                    [7, 8, 9],
    ...                    [np.nan, np.nan, np.nan]],
    ...                   columns=['A', 'B', 'C'])

    Aggregate these functions over the rows.

    >>> df.agg(['sum', 'min'])
            A     B     C
    sum  12.0  15.0  18.0
    min   1.0   2.0   3.0

    Different aggregations per column.

    >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
            A    B
    max   NaN  8.0
    min   1.0  2.0
    sum  12.0  NaN

    Aggregate over the columns.

    >>> df.agg("mean", axis="columns")
    0    2.0
    1    5.0
    2    8.0
    3    NaN
    dtype: float64
    """
    )
    @Substitution(
        see_also=_agg_summary_and_see_also_doc,
        examples=_agg_examples_doc,
        versionadded="\n.. versionadded:: 0.20.0\n",
        **_shared_doc_kwargs,
    )
    @Appender(_shared_docs["aggregate"])
    def aggregate(self, func, axis=0, *args, **kwargs):
        axis = self._get_axis_number(axis)

        result = None
        try:
            result, how = self._aggregate(func, axis=axis, *args, **kwargs)
        except TypeError:
            pass
        if result is None:
            # _aggregate produced nothing (e.g. ``func`` is a plain
            # callable); fall back to row-/column-wise application
            return self.apply(func, axis=axis, args=args, **kwargs)
        return result

    def _aggregate(self, arg, axis=0, *args, **kwargs):
        if axis == 1:
            # NDFrame.aggregate returns a tuple, and we need to transpose
            # only result
            result, how = self.T._aggregate(arg, *args, **kwargs)
            result = result.T if result is not None else result
            return result, how
        return super()._aggregate(arg, *args, **kwargs)

    agg = aggregate
    @Appender(_shared_docs["transform"] % _shared_doc_kwargs)
    def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame":
        axis = self._get_axis_number(axis)
        if axis == 1:
            return self.T.transform(func, *args, **kwargs).T
        return super().transform(func, *args, **kwargs)
    def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds):
        """
        Apply a function along an axis of the DataFrame.

        Objects passed to the function are Series objects whose index is
        either the DataFrame's index (``axis=0``) or the DataFrame's columns
        (``axis=1``). By default (``result_type=None``), the final return type
        is inferred from the return type of the applied function. Otherwise,
        it depends on the `result_type` argument.

        Parameters
        ----------
        func : function
            Function to apply to each column or row.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis along which the function is applied:

            * 0 or 'index': apply function to each column.
            * 1 or 'columns': apply function to each row.

        raw : bool, default False
            Determines if row or column is passed as a Series or ndarray object:

            * ``False`` : passes each row or column as a Series to the
              function.
            * ``True`` : the passed function will receive ndarray objects
              instead.
              If you are just applying a NumPy reduction function this will
              achieve much better performance.

        result_type : {'expand', 'reduce', 'broadcast', None}, default None
            These only act when ``axis=1`` (columns):

            * 'expand' : list-like results will be turned into columns.
            * 'reduce' : returns a Series if possible rather than expanding
              list-like results. This is the opposite of 'expand'.
            * 'broadcast' : results will be broadcast to the original shape
              of the DataFrame, the original index and columns will be
              retained.

            The default behaviour (None) depends on the return value of the
            applied function: list-like results will be returned as a Series
            of those. However if the apply function returns a Series these
            are expanded to columns.

            .. versionadded:: 0.23.0

        args : tuple
            Positional arguments to pass to `func` in addition to the
            array/series.
        **kwds
            Additional keyword arguments to pass as keywords arguments to
            `func`.

        Returns
        -------
        Series or DataFrame
            Result of applying ``func`` along the given axis of the
            DataFrame.

        See Also
        --------
        DataFrame.applymap : For elementwise operations.
        DataFrame.aggregate : Only perform aggregating type operations.
        DataFrame.transform : Only perform transforming type operations.

        Examples
        --------
        >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
        >>> df
           A  B
        0  4  9
        1  4  9
        2  4  9

        Using a numpy universal function (in this case the same as
        ``np.sqrt(df)``):

        >>> df.apply(np.sqrt)
             A    B
        0  2.0  3.0
        1  2.0  3.0
        2  2.0  3.0

        Using a reducing function on either axis

        >>> df.apply(np.sum, axis=0)
        A    12
        B    27
        dtype: int64

        >>> df.apply(np.sum, axis=1)
        0    13
        1    13
        2    13
        dtype: int64

        Returning a list-like will result in a Series

        >>> df.apply(lambda x: [1, 2], axis=1)
        0    [1, 2]
        1    [1, 2]
        2    [1, 2]
        dtype: object

        Passing ``result_type='expand'`` will expand list-like results
        to columns of a Dataframe

        >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
           0  1
        0  1  2
        1  1  2
        2  1  2

        Returning a Series inside the function is similar to passing
        ``result_type='expand'``. The resulting column names
        will be the Series index.

        >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
           foo  bar
        0    1    2
        1    1    2
        2    1    2

        Passing ``result_type='broadcast'`` will ensure the same shape
        result, whether list-like or scalar is returned by the function,
        and broadcast it along the axis. The resulting column names will
        be the originals.

        >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
           A  B
        0  1  2
        1  1  2
        2  1  2
        """
        from pandas.core.apply import frame_apply

        op = frame_apply(
            self,
            func=func,
            axis=axis,
            raw=raw,
            result_type=result_type,
            args=args,
            kwds=kwds,
        )
        return op.get_result()
    def applymap(self, func) -> "DataFrame":
        """
        Apply a function to a Dataframe elementwise.

        This method applies a function that accepts and returns a scalar
        to every element of a DataFrame.

        Parameters
        ----------
        func : callable
            Python function, returns a single value from a single value.

        Returns
        -------
        DataFrame
            Transformed DataFrame.

        See Also
        --------
        DataFrame.apply : Apply a function along input axis of DataFrame.

        Notes
        -----
        In the current implementation applymap calls `func` twice on the
        first column/row to decide whether it can take a fast or slow
        code path. This can lead to unexpected behavior if `func` has
        side-effects, as they will take effect twice for the first
        column/row.

        Examples
        --------
        >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
        >>> df
               0      1
        0  1.000  2.120
        1  3.356  4.567

        >>> df.applymap(lambda x: len(str(x)))
           0  1
        0  3  4
        1  5  5

        Note that a vectorized version of `func` often exists, which will
        be much faster. You could square each number elementwise.

        >>> df.applymap(lambda x: x**2)
                   0          1
        0   1.000000   4.494400
        1  11.262736  20.857489

        But it's better to avoid applymap in that case.

        >>> df ** 2
                   0          1
        0   1.000000   4.494400
        1  11.262736  20.857489
        """
        # if we have a dtype == 'M8[ns]', provide boxed values
        def infer(x):
            if x.empty:
                return lib.map_infer(x, func)
            return lib.map_infer(x.astype(object).values, func)

        return self.apply(infer)
    # ----------------------------------------------------------------------
    # Merging / joining methods

    def append(
        self, other, ignore_index=False, verify_integrity=False, sort=False
    ) -> "DataFrame":
        """
        Append rows of `other` to the end of caller, returning a new object.

        Columns in `other` that are not in the caller are added as new columns.

        Parameters
        ----------
        other : DataFrame or Series/dict-like object, or list of these
            The data to append.
        ignore_index : bool, default False
            If True, do not use the index labels.
        verify_integrity : bool, default False
            If True, raise ValueError on creating index with duplicates;
            see the sketch at the end of the Examples section.
        sort : bool, default False
            Sort columns if the columns of `self` and `other` are not aligned.

            .. versionadded:: 0.23.0
            .. versionchanged:: 1.0.0

                Changed to not sort by default.

        Returns
        -------
        DataFrame

        See Also
        --------
        concat : General function to concatenate DataFrame or Series objects.

        Notes
        -----
        If a list of dict/series is passed and the keys are all contained in
        the DataFrame's index, the order of the columns in the resulting
        DataFrame will be unchanged.

        Iteratively appending rows to a DataFrame can be more computationally
        intensive than a single concatenate. A better solution is to append
        those rows to a list and then concatenate the list with the original
        DataFrame all at once.

        Examples
        --------
        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
        >>> df
           A  B
        0  1  2
        1  3  4
        >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
        >>> df.append(df2)
           A  B
        0  1  2
        1  3  4
        0  5  6
        1  7  8

        With `ignore_index` set to True:

        >>> df.append(df2, ignore_index=True)
           A  B
        0  1  2
        1  3  4
        2  5  6
        3  7  8

        The following, while not recommended methods for generating DataFrames,
        show two ways to generate a DataFrame from multiple data sources.

        Less efficient:

        >>> df = pd.DataFrame(columns=['A'])
        >>> for i in range(5):
        ...     df = df.append({'A': i}, ignore_index=True)
        >>> df
           A
        0  0
        1  1
        2  2
        3  3
        4  4

        More efficient:

        >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
        ...           ignore_index=True)
           A
        0  0
        1  1
        2  2
        3  3
        4  4
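
        A sketch of ``verify_integrity`` (added for illustration): appending
        frames whose indexes overlap raises instead of silently duplicating
        labels.

        >>> df.append(df2, verify_integrity=True)
        Traceback (most recent call last):
        ...
        ValueError: Indexes have overlapping values: ...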
  5963. """
  5964. if isinstance(other, (Series, dict)):
  5965. if isinstance(other, dict):
  5966. other = Series(other)
  5967. if other.name is None and not ignore_index:
  5968. raise TypeError(
  5969. "Can only append a Series if ignore_index=True "
  5970. "or if the Series has a name"
  5971. )
  5972. index = Index([other.name], name=self.index.name)
  5973. idx_diff = other.index.difference(self.columns)
  5974. try:
  5975. combined_columns = self.columns.append(idx_diff)
  5976. except TypeError:
  5977. combined_columns = self.columns.astype(object).append(idx_diff)
  5978. other = (
  5979. other.reindex(combined_columns, copy=False)
  5980. .to_frame()
  5981. .T.infer_objects()
  5982. .rename_axis(index.names, copy=False)
  5983. )
  5984. if not self.columns.equals(combined_columns):
  5985. self = self.reindex(columns=combined_columns)
  5986. elif isinstance(other, list):
  5987. if not other:
  5988. pass
  5989. elif not isinstance(other[0], DataFrame):
  5990. other = DataFrame(other)
  5991. if (self.columns.get_indexer(other.columns) >= 0).all():
  5992. other = other.reindex(columns=self.columns)
  5993. from pandas.core.reshape.concat import concat
  5994. if isinstance(other, (list, tuple)):
  5995. to_concat = [self, *other]
  5996. else:
  5997. to_concat = [self, other]
  5998. return concat(
  5999. to_concat,
  6000. ignore_index=ignore_index,
  6001. verify_integrity=verify_integrity,
  6002. sort=sort,
  6003. )
    def join(
        self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
    ) -> "DataFrame":
        """
        Join columns of another DataFrame.

        Join columns with `other` DataFrame either on index or on a key
        column. Efficiently join multiple DataFrame objects by index at once by
        passing a list.

        Parameters
        ----------
        other : DataFrame, Series, or list of DataFrame
            Index should be similar to one of the columns in this one. If a
            Series is passed, its name attribute must be set, and that will be
            used as the column name in the resulting joined DataFrame.
        on : str, list of str, or array-like, optional
            Column or index level name(s) in the caller to join on the index
            in `other`, otherwise joins index-on-index. If multiple
            values given, the `other` DataFrame must have a MultiIndex. Can
            pass an array as the join key if it is not already contained in
            the calling DataFrame. Like an Excel VLOOKUP operation.
        how : {'left', 'right', 'outer', 'inner'}, default 'left'
            How to handle the operation of the two objects
            (an ``inner`` sketch appears at the end of the Examples).

            * left: use calling frame's index (or column if on is specified)
            * right: use `other`'s index.
            * outer: form union of calling frame's index (or column if on is
              specified) with `other`'s index, and sort it
              lexicographically.
            * inner: form intersection of calling frame's index (or column if
              on is specified) with `other`'s index, preserving the order
              of the calling's one.

        lsuffix : str, default ''
            Suffix to use from left frame's overlapping columns.
        rsuffix : str, default ''
            Suffix to use from right frame's overlapping columns.
        sort : bool, default False
            Order result DataFrame lexicographically by the join key. If False,
            the order of the join key depends on the join type (how keyword).

        Returns
        -------
        DataFrame
            A dataframe containing columns from both the caller and `other`.

        See Also
        --------
        DataFrame.merge : For column(s)-on-columns(s) operations.

        Notes
        -----
        Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
        passing a list of `DataFrame` objects.

        Support for specifying index levels as the `on` parameter was added
        in version 0.23.0.

        Examples
        --------
        >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
        ...                    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
        >>> df
          key   A
        0  K0  A0
        1  K1  A1
        2  K2  A2
        3  K3  A3
        4  K4  A4
        5  K5  A5

        >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
        ...                       'B': ['B0', 'B1', 'B2']})
        >>> other
          key   B
        0  K0  B0
        1  K1  B1
        2  K2  B2

        Join DataFrames using their indexes.

        >>> df.join(other, lsuffix='_caller', rsuffix='_other')
          key_caller   A key_other    B
        0         K0  A0        K0   B0
        1         K1  A1        K1   B1
        2         K2  A2        K2   B2
        3         K3  A3       NaN  NaN
        4         K4  A4       NaN  NaN
        5         K5  A5       NaN  NaN

        If we want to join using the key columns, we need to set key to be
        the index in both `df` and `other`. The joined DataFrame will have
        key as its index.

        >>> df.set_index('key').join(other.set_index('key'))
              A    B
        key
        K0   A0   B0
        K1   A1   B1
        K2   A2   B2
        K3   A3  NaN
        K4   A4  NaN
        K5   A5  NaN

        Another option to join using the key columns is to use the `on`
        parameter. DataFrame.join always uses `other`'s index but we can use
        any column in `df`. This method preserves the original DataFrame's
        index in the result.

        >>> df.join(other.set_index('key'), on='key')
          key   A    B
        0  K0  A0   B0
        1  K1  A1   B1
        2  K2  A2   B2
        3  K3  A3  NaN
        4  K4  A4  NaN
        5  K5  A5  NaN
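
        A sketch of ``how='inner'`` (added for illustration): only the keys
        present in both frames survive.

        >>> df.join(other.set_index('key'), on='key', how='inner')
          key   A   B
        0  K0  A0  B0
        1  K1  A1  B1
        2  K2  A2  B2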
  6106. """
  6107. return self._join_compat(
  6108. other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort
  6109. )
    def _join_compat(
        self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False
    ):
        from pandas.core.reshape.merge import merge
        from pandas.core.reshape.concat import concat

        if isinstance(other, Series):
            if other.name is None:
                raise ValueError("Other Series must have a name")
            other = DataFrame({other.name: other})

        if isinstance(other, DataFrame):
            return merge(
                self,
                other,
                left_on=on,
                how=how,
                left_index=on is None,
                right_index=True,
                suffixes=(lsuffix, rsuffix),
                sort=sort,
            )
        else:
            if on is not None:
                raise ValueError(
                    "Joining multiple DataFrames only supported for joining on index"
                )

            frames = [self] + list(other)

            can_concat = all(df.index.is_unique for df in frames)

            # join indexes only using concat
            if can_concat:
                if how == "left":
                    res = concat(
                        frames, axis=1, join="outer", verify_integrity=True, sort=sort
                    )
                    return res.reindex(self.index, copy=False)
                else:
                    return concat(
                        frames, axis=1, join=how, verify_integrity=True, sort=sort
                    )

            # fall back to pairwise merges when some index is not unique
            joined = frames[0]

            for frame in frames[1:]:
                joined = merge(
                    joined, frame, how=how, left_index=True, right_index=True
                )

            return joined
    @Substitution("")
    @Appender(_merge_doc, indents=2)
    def merge(
        self,
        right,
        how="inner",
        on=None,
        left_on=None,
        right_on=None,
        left_index=False,
        right_index=False,
        sort=False,
        suffixes=("_x", "_y"),
        copy=True,
        indicator=False,
        validate=None,
    ) -> "DataFrame":
        from pandas.core.reshape.merge import merge

        return merge(
            self,
            right,
            how=how,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            copy=copy,
            indicator=indicator,
            validate=validate,
        )
    def round(self, decimals=0, *args, **kwargs) -> "DataFrame":
        """
        Round a DataFrame to a variable number of decimal places.

        Parameters
        ----------
        decimals : int, dict, Series
            Number of decimal places to round each column to. If an int is
            given, round each column to the same number of places.
            Otherwise dict and Series round to variable numbers of places.
            Column names should be in the keys if `decimals` is a
            dict-like, or in the index if `decimals` is a Series. Any
            columns not included in `decimals` will be left as is. Elements
            of `decimals` which are not columns of the input will be
            ignored.
        *args
            Additional keywords have no effect but might be accepted for
            compatibility with numpy.
        **kwargs
            Additional keywords have no effect but might be accepted for
            compatibility with numpy.

        Returns
        -------
        DataFrame
            A DataFrame with the affected columns rounded to the specified
            number of decimal places.

        See Also
        --------
        numpy.around : Round a numpy array to the given number of decimals.
        Series.round : Round a Series to the given number of decimals.

        Examples
        --------
        >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
        ...                   columns=['dogs', 'cats'])
        >>> df
           dogs  cats
        0  0.21  0.32
        1  0.01  0.67
        2  0.66  0.03
        3  0.21  0.18

        By providing an integer each column is rounded to the same number
        of decimal places.

        >>> df.round(1)
           dogs  cats
        0   0.2   0.3
        1   0.0   0.7
        2   0.7   0.0
        3   0.2   0.2

        With a dict, the number of places for specific columns can be
        specified with the column names as keys and the number of decimal
        places as values.

        >>> df.round({'dogs': 1, 'cats': 0})
           dogs  cats
        0   0.2   0.0
        1   0.0   1.0
        2   0.7   0.0
        3   0.2   0.0

        Using a Series, the number of places for specific columns can be
        specified with the column names as index and the number of
        decimal places as values.

        >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
        >>> df.round(decimals)
           dogs  cats
        0   0.2   0.0
        1   0.0   1.0
        2   0.7   0.0
        3   0.2   0.0
        """
        from pandas.core.reshape.concat import concat

        def _dict_round(df, decimals):
            for col, vals in df.items():
                try:
                    yield _series_round(vals, decimals[col])
                except KeyError:
                    yield vals

        def _series_round(s, decimals):
            if is_integer_dtype(s) or is_float_dtype(s):
                return s.round(decimals)
            return s

        nv.validate_round(args, kwargs)

        if isinstance(decimals, (dict, Series)):
            if isinstance(decimals, Series):
                if not decimals.index.is_unique:
                    raise ValueError("Index of decimals must be unique")
            new_cols = list(_dict_round(self, decimals))
        elif is_integer(decimals):
            # Dispatch to Series.round
            new_cols = [_series_round(v, decimals) for _, v in self.items()]
        else:
            raise TypeError("decimals must be an integer, a dict-like or a Series")

        if len(new_cols) > 0:
            return self._constructor(
                concat(new_cols, axis=1), index=self.index, columns=self.columns
            )
        else:
            return self
    # ----------------------------------------------------------------------
    # Statistical methods, etc.

    def corr(self, method="pearson", min_periods=1) -> "DataFrame":
        """
        Compute pairwise correlation of columns, excluding NA/null values.

        Parameters
        ----------
        method : {'pearson', 'kendall', 'spearman'} or callable
            Method of correlation:

            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
                and returning a float. Note that the returned matrix from corr
                will have 1 along the diagonals and will be symmetric
                regardless of the callable's behavior.

                .. versionadded:: 0.24.0

        min_periods : int, optional
            Minimum number of observations required per pair of columns
            to have a valid result. Currently only available for Pearson
            and Spearman correlation.

        Returns
        -------
        DataFrame
            Correlation matrix.

        See Also
        --------
        DataFrame.corrwith
        Series.corr

        Examples
        --------
        >>> def histogram_intersection(a, b):
        ...     v = np.minimum(a, b).sum().round(decimals=1)
        ...     return v
        >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.corr(method=histogram_intersection)
              dogs  cats
        dogs   1.0   0.3
        cats   0.3   1.0
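
        A minimal pearson example (added for illustration; perfectly linear
        columns correlate at exactly 1):

        >>> pd.DataFrame({'a': [1, 2, 3], 'b': [2, 4, 6]}).corr()
             a    b
        a  1.0  1.0
        b  1.0  1.0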
  6322. """
  6323. numeric_df = self._get_numeric_data()
  6324. cols = numeric_df.columns
  6325. idx = cols.copy()
  6326. mat = numeric_df.values
  6327. if method == "pearson":
  6328. correl = libalgos.nancorr(ensure_float64(mat), minp=min_periods)
  6329. elif method == "spearman":
  6330. correl = libalgos.nancorr_spearman(ensure_float64(mat), minp=min_periods)
  6331. elif method == "kendall" or callable(method):
  6332. if min_periods is None:
  6333. min_periods = 1
  6334. mat = ensure_float64(mat).T
  6335. corrf = nanops.get_corr_func(method)
  6336. K = len(cols)
  6337. correl = np.empty((K, K), dtype=float)
  6338. mask = np.isfinite(mat)
  6339. for i, ac in enumerate(mat):
  6340. for j, bc in enumerate(mat):
  6341. if i > j:
  6342. continue
  6343. valid = mask[i] & mask[j]
  6344. if valid.sum() < min_periods:
  6345. c = np.nan
  6346. elif i == j:
  6347. c = 1.0
  6348. elif not valid.all():
  6349. c = corrf(ac[valid], bc[valid])
  6350. else:
  6351. c = corrf(ac, bc)
  6352. correl[i, j] = c
  6353. correl[j, i] = c
  6354. else:
  6355. raise ValueError(
  6356. "method must be either 'pearson', "
  6357. "'spearman', 'kendall', or a callable, "
  6358. f"'{method}' was supplied"
  6359. )
  6360. return self._constructor(correl, index=idx, columns=cols)
    def cov(self, min_periods=None) -> "DataFrame":
        """
        Compute pairwise covariance of columns, excluding NA/null values.

        Compute the pairwise covariance among the series of a DataFrame.
        The returned data frame is the `covariance matrix
        <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
        of the DataFrame.

        Both NA and null values are automatically excluded from the
        calculation. (See the note below about bias from missing values.)
        A threshold can be set for the minimum number of
        observations for each value created. Comparisons with observations
        below this threshold will be returned as ``NaN``.

        This method is generally used for the analysis of time series data to
        understand the relationship between different measures
        across time.

        Parameters
        ----------
        min_periods : int, optional
            Minimum number of observations required per pair of columns
            to have a valid result.

        Returns
        -------
        DataFrame
            The covariance matrix of the series of the DataFrame.

        See Also
        --------
        Series.cov : Compute covariance with another Series.
        core.window.EWM.cov : Exponential weighted sample covariance.
        core.window.Expanding.cov : Expanding sample covariance.
        core.window.Rolling.cov : Rolling sample covariance.

        Notes
        -----
        Returns the covariance matrix of the DataFrame's time series.
        The covariance is normalized by N-1.

        For DataFrames that have Series that are missing data (assuming that
        data is `missing at random
        <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
        the returned covariance matrix will be an unbiased estimate
        of the variance and covariance between the member Series.

        However, for many applications this estimate may not be acceptable
        because the estimated covariance matrix is not guaranteed to be
        positive semi-definite. This could lead to estimated correlations
        having absolute values which are greater than one, and/or a
        non-invertible covariance matrix. See `Estimation of covariance
        matrices
        <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_matrices>`__
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.cov()
                  dogs      cats
        dogs  0.666667 -1.000000
        cats -1.000000  1.666667

        >>> np.random.seed(42)
        >>> df = pd.DataFrame(np.random.randn(1000, 5),
        ...                   columns=['a', 'b', 'c', 'd', 'e'])
        >>> df.cov()
                  a         b         c         d         e
        a  0.998438 -0.020161  0.059277 -0.008943  0.014144
        b -0.020161  1.059352 -0.008543 -0.024738  0.009826
        c  0.059277 -0.008543  1.010670 -0.001486 -0.000271
        d -0.008943 -0.024738 -0.001486  0.921297 -0.013692
        e  0.014144  0.009826 -0.000271 -0.013692  0.977795

        **Minimum number of periods**

        This method also supports an optional ``min_periods`` keyword
        that specifies the required minimum number of non-NA observations for
        each column pair in order to have a valid result:

        >>> np.random.seed(42)
        >>> df = pd.DataFrame(np.random.randn(20, 3),
        ...                   columns=['a', 'b', 'c'])
        >>> df.loc[df.index[:5], 'a'] = np.nan
        >>> df.loc[df.index[5:10], 'b'] = np.nan
        >>> df.cov(min_periods=12)
                  a         b         c
        a  0.316741       NaN -0.150812
        b       NaN  1.248003  0.191417
        c -0.150812  0.191417  0.895202
        """
        numeric_df = self._get_numeric_data()
        cols = numeric_df.columns
        idx = cols.copy()
        mat = numeric_df.values

        if notna(mat).all():
            if min_periods is not None and min_periods > len(mat):
                baseCov = np.empty((mat.shape[1], mat.shape[1]))
                baseCov.fill(np.nan)
            else:
                baseCov = np.cov(mat.T)
            baseCov = baseCov.reshape((len(cols), len(cols)))
        else:
            baseCov = libalgos.nancorr(ensure_float64(mat), cov=True, minp=min_periods)

        return self._constructor(baseCov, index=idx, columns=cols)
    def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series:
        """
        Compute pairwise correlation.

        Pairwise correlation is computed between rows or columns of
        DataFrame with rows or columns of Series or DataFrame. DataFrames
        are first aligned along both axes before computing the
        correlations.

        Parameters
        ----------
        other : DataFrame, Series
            Object with which to compute correlations.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' to compute column-wise, 1 or 'columns' for
            row-wise.
        drop : bool, default False
            Drop missing indices from result.
        method : {'pearson', 'kendall', 'spearman'} or callable
            Method of correlation:

            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
                and returning a float.

            .. versionadded:: 0.24.0

        Returns
        -------
        Series
            Pairwise correlations.

        See Also
        --------
        DataFrame.corr
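
        Examples
        --------
        A minimal sketch (added for illustration; the matching columns are
        perfectly correlated and anti-correlated by construction):

        >>> df1 = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [4, 3, 2, 1]})
        >>> df2 = pd.DataFrame({'a': [2, 4, 6, 8], 'b': [1, 2, 3, 4]})
        >>> df1.corrwith(df2)
        a    1.0
        b   -1.0
        dtype: float64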
  6485. """
  6486. axis = self._get_axis_number(axis)
  6487. this = self._get_numeric_data()
  6488. if isinstance(other, Series):
  6489. return this.apply(lambda x: other.corr(x, method=method), axis=axis)
  6490. other = other._get_numeric_data()
  6491. left, right = this.align(other, join="inner", copy=False)
  6492. if axis == 1:
  6493. left = left.T
  6494. right = right.T
  6495. if method == "pearson":
  6496. # mask missing values
  6497. left = left + right * 0
  6498. right = right + left * 0
  6499. # demeaned data
  6500. ldem = left - left.mean()
  6501. rdem = right - right.mean()
  6502. num = (ldem * rdem).sum()
  6503. dom = (left.count() - 1) * left.std() * right.std()
  6504. correl = num / dom
  6505. elif method in ["kendall", "spearman"] or callable(method):
  6506. def c(x):
  6507. return nanops.nancorr(x[0], x[1], method=method)
  6508. correl = Series(
  6509. map(c, zip(left.values.T, right.values.T)), index=left.columns
  6510. )
  6511. else:
  6512. raise ValueError(
  6513. f"Invalid method {method} was passed, "
  6514. "valid methods are: 'pearson', 'kendall', "
  6515. "'spearman', or callable"
  6516. )
  6517. if not drop:
  6518. # Find non-matching labels along the given axis
  6519. # and append missing correlations (GH 22375)
  6520. raxis = 1 if axis == 0 else 0
  6521. result_index = this._get_axis(raxis).union(other._get_axis(raxis))
  6522. idx_diff = result_index.difference(correl.index)
  6523. if len(idx_diff) > 0:
  6524. correl = correl.append(Series([np.nan] * len(idx_diff), index=idx_diff))
  6525. return correl
    # ----------------------------------------------------------------------
    # ndarray-like stats methods

    def count(self, axis=0, level=None, numeric_only=False):
        """
        Count non-NA cells for each column or row.

        The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
        on `pandas.options.mode.use_inf_as_na`) are considered NA.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            If 0 or 'index' counts are generated for each column.
            If 1 or 'columns' counts are generated for each **row**.
        level : int or str, optional
            If the axis is a `MultiIndex` (hierarchical), count along a
            particular `level`, collapsing into a `DataFrame`.
            A `str` specifies the level name.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

        Returns
        -------
        Series or DataFrame
            For each column/row the number of non-NA/null entries.
            If `level` is specified returns a `DataFrame`.

        See Also
        --------
        Series.count : Number of non-NA elements in a Series.
        DataFrame.shape : Number of DataFrame rows and columns (including NA
            elements).
        DataFrame.isna : Boolean same-sized DataFrame showing places of NA
            elements.

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = pd.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26],
        ...                    "Single": [False, True, True, True, False]})
        >>> df
           Person   Age  Single
        0    John  24.0   False
        1    Myla   NaN    True
        2   Lewis  21.0    True
        3    John  33.0    True
        4    Myla  26.0   False

        Notice the uncounted NA values:

        >>> df.count()
        Person    5
        Age       4
        Single    5
        dtype: int64

        Counts for each **row**:

        >>> df.count(axis='columns')
        0    3
        1    2
        2    3
        3    3
        4    3
        dtype: int64

        Counts for one level of a `MultiIndex`:

        >>> df.set_index(["Person", "Single"]).count(level="Person")
                Age
        Person
        John      2
        Lewis     1
        Myla      1
        """
        axis = self._get_axis_number(axis)
        if level is not None:
            return self._count_level(level, axis=axis, numeric_only=numeric_only)

        if numeric_only:
            frame = self._get_numeric_data()
        else:
            frame = self

        # GH #423
        if len(frame._get_axis(axis)) == 0:
            result = Series(0, index=frame._get_agg_axis(axis))
        else:
            if frame._is_mixed_type or frame._data.any_extension_types:
                # the or any_extension_types is really only hit for single-
                # column frames with an extension array
                result = notna(frame).sum(axis=axis)
            else:
                # GH13407
                series_counts = notna(frame).sum(axis=axis)
                counts = series_counts.values
                result = Series(counts, index=frame._get_agg_axis(axis))

        return result.astype("int64")
    def _count_level(self, level, axis=0, numeric_only=False):
        if numeric_only:
            frame = self._get_numeric_data()
        else:
            frame = self

        count_axis = frame._get_axis(axis)
        agg_axis = frame._get_agg_axis(axis)

        if not isinstance(count_axis, ABCMultiIndex):
            raise TypeError(
                f"Can only count levels on hierarchical {self._get_axis_name(axis)}."
            )

        if frame._is_mixed_type:
            # Since we have mixed types, calling notna(frame.values) might
            # upcast everything to object
            mask = notna(frame).values
        else:
            # But use the speedup when we have homogeneous dtypes
            mask = notna(frame.values)

        if axis == 1:
            # We're transposing the mask rather than frame to avoid potential
            # upcasts to object, which induces a ~20x slowdown
            mask = mask.T

        if isinstance(level, str):
            level = count_axis._get_level_number(level)

        level_name = count_axis._names[level]
        level_index = count_axis.levels[level]._shallow_copy(name=level_name)
        level_codes = ensure_int64(count_axis.codes[level])
        counts = lib.count_level_2d(mask, level_codes, len(level_index), axis=0)

        result = DataFrame(counts, index=level_index, columns=agg_axis)

        if axis == 1:
            # Undo our earlier transpose
            return result.T
        else:
            return result

    def _reduce(
        self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
    ):
        if axis is None and filter_type == "bool":
            labels = None
            constructor = None
        else:
            # TODO: Make other agg func handle axis=None properly
            axis = self._get_axis_number(axis)
            labels = self._get_agg_axis(axis)
            constructor = self._constructor

        def f(x):
            return op(x, axis=axis, skipna=skipna, **kwds)

        def _get_data(axis_matters):
            if filter_type is None or filter_type == "numeric":
                data = self._get_numeric_data()
            elif filter_type == "bool":
                if axis_matters:
                    # GH#25101, GH#24434
                    data = self._get_bool_data() if axis == 0 else self
                else:
                    data = self._get_bool_data()
            else:  # pragma: no cover
                msg = (
                    f"Generating numeric_only data with filter_type {filter_type} "
                    "not supported."
                )
                raise NotImplementedError(msg)
            return data

        if numeric_only is not None and axis in [0, 1]:
            df = self
            if numeric_only is True:
                df = _get_data(axis_matters=True)
            if axis == 1:
                df = df.T
                axis = 0

            out_dtype = "bool" if filter_type == "bool" else None

            # After possibly _get_data and transposing, we are now in the
            # simple case where we can use BlockManager._reduce
            res = df._data.reduce(op, axis=1, skipna=skipna, **kwds)
            assert isinstance(res, dict)
            if len(res):
                assert len(res) == max(list(res.keys())) + 1, res.keys()
            out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype)
            out.index = df.columns
            return out

        if numeric_only is None:
            values = self.values
            try:
                result = f(values)

                if filter_type == "bool" and is_object_dtype(values) and axis is None:
                    # work around https://github.com/numpy/numpy/issues/10489
                    # TODO: combine with hasattr(result, 'dtype') further down
                    # hard since we don't have `values` down there.
                    result = np.bool_(result)
            except TypeError:
                # e.g. in nanops trying to convert strs to float

                # try by-column first
                if filter_type is None and axis == 0:
                    # this can end up with a non-reduction
                    # but not always. if the types are mixed
                    # with datelike then need to make sure a series

                    # we only end up here if we have not specified
                    # numeric_only and yet we have tried a
                    # column-by-column reduction, where we have mixed type.
                    # So let's just do what we can
                    from pandas.core.apply import frame_apply

                    opa = frame_apply(
                        self, func=f, result_type="expand", ignore_failures=True
                    )
                    result = opa.get_result()
                    if result.ndim == self.ndim:
                        result = result.iloc[0]
                    return result
                # TODO: why doesn't axis matter here?
                data = _get_data(axis_matters=False)
                with np.errstate(all="ignore"):
                    result = f(data.values)
                labels = data._get_agg_axis(axis)
        else:
            if numeric_only:
                data = _get_data(axis_matters=True)
                values = data.values
                labels = data._get_agg_axis(axis)
            else:
                values = self.values
            result = f(values)

        if hasattr(result, "dtype") and is_object_dtype(result.dtype):
            try:
                if filter_type is None or filter_type == "numeric":
                    result = result.astype(np.float64)
                elif filter_type == "bool" and notna(result).all():
                    result = result.astype(np.bool_)
            except (ValueError, TypeError):
                # try to coerce to the original dtypes item by item if we can
                if axis == 0:
                    result = coerce_to_dtypes(result, self.dtypes)

        if constructor is not None:
            result = Series(result, index=labels)
        return result

    def nunique(self, axis=0, dropna=True) -> Series:
        """
        Count distinct observations over requested axis.

        Return Series with number of distinct observations. Can ignore NaN
        values.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise.
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        Series

        See Also
        --------
        Series.nunique: Method nunique for Series.
        DataFrame.count: Count non-NA cells for each column or row.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
        >>> df.nunique()
        A    3
        B    1
        dtype: int64

        >>> df.nunique(axis=1)
        0    1
        1    2
        2    2
        dtype: int64
        """
        return self.apply(Series.nunique, axis=axis, dropna=dropna)

    def idxmin(self, axis=0, skipna=True) -> Series:
        """
        Return index of first occurrence of minimum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.

        Returns
        -------
        Series
            Indexes of minima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.
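
        Examples
        --------
        A small, illustrative frame (hypothetical data); the label of the
        first minimum in each column is returned:

        >>> df = pd.DataFrame({'a': [3, 1, 2], 'b': [2.0, np.nan, 1.0]})
        >>> df.idxmin()
        a    1
        b    2
        dtype: int64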
  6805. """
  6806. axis = self._get_axis_number(axis)
  6807. indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
  6808. index = self._get_axis(axis)
  6809. result = [index[i] if i >= 0 else np.nan for i in indices]
  6810. return Series(result, index=self._get_agg_axis(axis))

    def idxmax(self, axis=0, skipna=True) -> Series:
        """
        Return index of first occurrence of maximum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.

        Returns
        -------
        Series
            Indexes of maxima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.
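
        Examples
        --------
        Using the same kind of small, illustrative frame, the label of the
        first maximum in each column is returned:

        >>> df = pd.DataFrame({'a': [3, 1, 2], 'b': [2.0, np.nan, 1.0]})
        >>> df.idxmax()
        a    0
        b    0
        dtype: int64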
  6836. """
  6837. axis = self._get_axis_number(axis)
  6838. indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
  6839. index = self._get_axis(axis)
  6840. result = [index[i] if i >= 0 else np.nan for i in indices]
  6841. return Series(result, index=self._get_agg_axis(axis))

    def _get_agg_axis(self, axis_num):
        """
        Return the labels that an aggregation along ``axis_num`` is indexed
        by: the columns for axis 0, the index for axis 1.
        """
        if axis_num == 0:
            return self.columns
        elif axis_num == 1:
            return self.index
        else:
            raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")

    def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame":
        """
        Get the mode(s) of each element along the selected axis.

        The mode of a set of values is the value that appears most often.
        It can be multiple values.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to iterate over while searching for the mode:

            * 0 or 'index' : get mode of each column
            * 1 or 'columns' : get mode of each row.

        numeric_only : bool, default False
            If True, only apply to numeric columns.
        dropna : bool, default True
            Don't consider counts of NaN/NaT.

            .. versionadded:: 0.24.0

        Returns
        -------
        DataFrame
            The modes of each column or row.

        See Also
        --------
        Series.mode : Return the highest frequency value in a Series.
        Series.value_counts : Return the counts of values in a Series.

        Examples
        --------
        >>> df = pd.DataFrame([('bird', 2, 2),
        ...                    ('mammal', 4, np.nan),
        ...                    ('arthropod', 8, 0),
        ...                    ('bird', 2, np.nan)],
        ...                   index=('falcon', 'horse', 'spider', 'ostrich'),
        ...                   columns=('species', 'legs', 'wings'))
        >>> df
                   species  legs  wings
        falcon        bird     2    2.0
        horse       mammal     4    NaN
        spider   arthropod     8    0.0
        ostrich       bird     2    NaN

        By default, missing values are not considered, and the mode of wings
        is both 0 and 2. The second row of ``species`` and ``legs`` contains
        ``NaN``, because those columns have only one mode each, but the
        DataFrame has two rows.

        >>> df.mode()
          species  legs  wings
        0    bird   2.0    0.0
        1     NaN   NaN    2.0

        Setting ``dropna=False``, ``NaN`` values are considered, and they can
        be the mode (like for wings).

        >>> df.mode(dropna=False)
          species  legs  wings
        0    bird     2    NaN

        Setting ``numeric_only=True``, only the mode of numeric columns is
        computed, and columns of other types are ignored.

        >>> df.mode(numeric_only=True)
           legs  wings
        0   2.0    0.0
        1   NaN    2.0

        To compute the mode over columns and not rows, use the axis parameter:

        >>> df.mode(axis='columns', numeric_only=True)
                   0    1
        falcon   2.0  NaN
        horse    4.0  NaN
        spider   0.0  8.0
        ostrich  2.0  NaN
        """
        data = self if not numeric_only else self._get_numeric_data()

        def f(s):
            return s.mode(dropna=dropna)

        return data.apply(f, axis=axis)

    def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
        """
        Return values at the given quantile over requested axis.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value between 0 <= q <= 1, the quantile(s) to compute.
        axis : {0, 1, 'index', 'columns'} (default 0)
            Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
        numeric_only : bool, default True
            If False, the quantile of datetime and timedelta data will be
            computed as well.
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points `i` and `j`:

            * linear: `i + (j - i) * fraction`, where `fraction` is the
              fractional part of the index surrounded by `i` and `j`.
            * lower: `i`.
            * higher: `j`.
            * nearest: `i` or `j` whichever is nearest.
            * midpoint: (`i` + `j`) / 2.

        Returns
        -------
        Series or DataFrame

            If ``q`` is an array, a DataFrame will be returned where the
            index is ``q``, the columns are the columns of self, and the
            values are the quantiles.
            If ``q`` is a float, a Series will be returned where the
            index is the columns of self and the values are the quantiles.

        See Also
        --------
        core.window.Rolling.quantile: Rolling quantile.
        numpy.percentile: Numpy function to compute the percentile.

        Examples
        --------
        >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
        ...                   columns=['a', 'b'])
        >>> df.quantile(.1)
        a    1.3
        b    3.7
        Name: 0.1, dtype: float64
        >>> df.quantile([.1, .5])
               a     b
        0.1  1.3   3.7
        0.5  2.5  55.0

        Specifying `numeric_only=False` will also compute the quantile of
        datetime and timedelta data.

        >>> df = pd.DataFrame({'A': [1, 2],
        ...                    'B': [pd.Timestamp('2010'),
        ...                          pd.Timestamp('2011')],
        ...                    'C': [pd.Timedelta('1 days'),
        ...                          pd.Timedelta('2 days')]})
        >>> df.quantile(0.5, numeric_only=False)
        A                    1.5
        B    2010-07-02 12:00:00
        C        1 days 12:00:00
        Name: 0.5, dtype: object
        """
        validate_percentile(q)

        data = self._get_numeric_data() if numeric_only else self
        axis = self._get_axis_number(axis)
        is_transposed = axis == 1

        if is_transposed:
            data = data.T

        if len(data.columns) == 0:
            # GH#23925 _get_numeric_data may have dropped all columns
            cols = Index([], name=self.columns.name)
            if is_list_like(q):
                return self._constructor([], index=q, columns=cols)
            return self._constructor_sliced([], index=cols, name=q, dtype=np.float64)

        result = data._data.quantile(
            qs=q, axis=1, interpolation=interpolation, transposed=is_transposed
        )

        if result.ndim == 2:
            result = self._constructor(result)
        else:
            result = self._constructor_sliced(result, name=q)

        if is_transposed:
            result = result.T

        return result

    def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame":
        """
        Cast to DatetimeIndex of timestamps, at *beginning* of period.

        Parameters
        ----------
        freq : str, default frequency of PeriodIndex
            Desired frequency.
        how : {'s', 'e', 'start', 'end'}
            Convention for converting period to timestamp; start of period
            vs. end.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to convert (the index by default).
        copy : bool, default True
            If False then underlying input data is not copied.

        Returns
        -------
        DataFrame with DatetimeIndex
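
        Examples
        --------
        A minimal sketch with a monthly ``PeriodIndex`` (hypothetical data;
        output shown indicatively):

        >>> idx = pd.period_range("2020-01", periods=2, freq="M")
        >>> df = pd.DataFrame({"sales": [10, 20]}, index=idx)
        >>> df.to_timestamp().index  # doctest: +SKIP
        DatetimeIndex(['2020-01-01', '2020-02-01'], dtype='datetime64[ns]', freq='MS')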
  7017. """
  7018. new_data = self._data
  7019. if copy:
  7020. new_data = new_data.copy()
  7021. axis = self._get_axis_number(axis)
  7022. if axis == 0:
  7023. new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))
  7024. elif axis == 1:
  7025. new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
  7026. else: # pragma: no cover
  7027. raise AssertionError(f"Axis must be 0 or 1. Got {axis}")
  7028. return self._constructor(new_data)

    def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame":
        """
        Convert DataFrame from DatetimeIndex to PeriodIndex.

        Convert DataFrame from DatetimeIndex to PeriodIndex with desired
        frequency (inferred from index if not passed).

        Parameters
        ----------
        freq : str, default None
            Frequency of the PeriodIndex.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to convert (the index by default).
        copy : bool, default True
            If False then underlying input data is not copied.

        Returns
        -------
        DataFrame with PeriodIndex
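
        Examples
        --------
        A minimal sketch converting a daily ``DatetimeIndex`` to monthly
        periods (hypothetical data; output shown indicatively):

        >>> idx = pd.to_datetime(["2020-01-31", "2020-02-29"])
        >>> df = pd.DataFrame({"sales": [10, 20]}, index=idx)
        >>> df.to_period(freq="M").index  # doctest: +SKIP
        PeriodIndex(['2020-01', '2020-02'], dtype='period[M]', freq='M')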
  7045. """
  7046. new_data = self._data
  7047. if copy:
  7048. new_data = new_data.copy()
  7049. axis = self._get_axis_number(axis)
  7050. if axis == 0:
  7051. new_data.set_axis(1, self.index.to_period(freq=freq))
  7052. elif axis == 1:
  7053. new_data.set_axis(0, self.columns.to_period(freq=freq))
  7054. else: # pragma: no cover
  7055. raise AssertionError(f"Axis must be 0 or 1. Got {axis}")
  7056. return self._constructor(new_data)

    def isin(self, values) -> "DataFrame":
        """
        Whether each element in the DataFrame is contained in values.

        Parameters
        ----------
        values : iterable, Series, DataFrame or dict
            The result will only be true at a location if all the
            labels match. If `values` is a Series, that's the index. If
            `values` is a dict, the keys must be the column names,
            which must match. If `values` is a DataFrame,
            then both the index and column labels must match.

        Returns
        -------
        DataFrame
            DataFrame of booleans showing whether each element in the DataFrame
            is contained in values.

        See Also
        --------
        DataFrame.eq: Equality test for DataFrame.
        Series.isin: Equivalent method on Series.
        Series.str.contains: Test if pattern or regex is contained within a
            string of a Series or Index.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
        ...                   index=['falcon', 'dog'])
        >>> df
                num_legs  num_wings
        falcon         2          2
        dog            4          0

        When ``values`` is a list, check whether every value in the DataFrame
        is present in the list (which animals have 0 or 2 legs or wings):

        >>> df.isin([0, 2])
                num_legs  num_wings
        falcon      True       True
        dog        False       True

        When ``values`` is a dict, we can pass values to check for each
        column separately:

        >>> df.isin({'num_wings': [0, 3]})
                num_legs  num_wings
        falcon     False      False
        dog        False       True

        When ``values`` is a Series or DataFrame the index and column must
        match. Note that 'falcon' does not match based on the number of legs
        in ``other``.

        >>> other = pd.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]},
        ...                      index=['spider', 'falcon'])
        >>> df.isin(other)
                num_legs  num_wings
        falcon      True       True
        dog        False      False
        """
        if isinstance(values, dict):
            from pandas.core.reshape.concat import concat

            values = collections.defaultdict(list, values)
            return concat(
                (
                    self.iloc[:, [i]].isin(values[col])
                    for i, col in enumerate(self.columns)
                ),
                axis=1,
            )
        elif isinstance(values, Series):
            if not values.index.is_unique:
                raise ValueError("cannot compute isin with a duplicate axis.")
            return self.eq(values.reindex_like(self), axis="index")
        elif isinstance(values, DataFrame):
            if not (values.columns.is_unique and values.index.is_unique):
                raise ValueError("cannot compute isin with a duplicate axis.")
            return self.eq(values.reindex_like(self))
        else:
            if not is_list_like(values):
                raise TypeError(
                    "only list-like or dict-like objects are allowed "
                    "to be passed to DataFrame.isin(), "
                    f"you passed a {repr(type(values).__name__)}"
                )
            return DataFrame(
                algorithms.isin(self.values.ravel(), values).reshape(self.shape),
                self.index,
                self.columns,
            )

    # ----------------------------------------------------------------------
    # Add plotting methods to DataFrame
    plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
    hist = pandas.plotting.hist_frame
    boxplot = pandas.plotting.boxplot_frame
    sparse = CachedAccessor("sparse", SparseFrameAccessor)


DataFrame._setup_axes(
    ["index", "columns"],
    docs={
        "index": "The index (row labels) of the DataFrame.",
        "columns": "The column labels of the DataFrame.",
    },
)

DataFrame._add_numeric_operations()
DataFrame._add_series_or_dataframe_operations()

ops.add_flex_arithmetic_methods(DataFrame)
ops.add_special_arithmetic_methods(DataFrame)


def _from_nested_dict(data):
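    """
    Invert a nested mapping of the form ``{index -> {column -> value}}``
    into ``{column -> {index -> value}}``.

    A minimal sketch (hypothetical data):

    >>> _from_nested_dict({"r1": {"a": 1}, "r2": {"a": 2, "b": 3}})
    {'a': {'r1': 1, 'r2': 2}, 'b': {'r2': 3}}
    """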
    # TODO: this should be seriously cythonized
    new_data = {}
    for index, s in data.items():
        for col, v in s.items():
            new_data[col] = new_data.get(col, {})
            new_data[col][index] = v
    return new_data


def _put_str(s, space):
    # Render ``s`` as a fixed-width cell: truncate to ``space`` characters,
    # then left-justify with padding.
    return str(s)[:space].ljust(space)