# Learn_Numpy.py
import re
import math  # used by list_filter
from os.path import split as path_split
from os.path import exists, basename, splitext
from os import mkdir, getcwd
import tarfile
from sklearn.svm import SVC, SVR  # SVC is SVM classification, SVR is SVM regression
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.manifold import TSNE
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as Lda
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA, NMF
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import *
from sklearn.feature_selection import *
from sklearn.metrics import *
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
)
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame, read_csv
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import *
from sklearn.model_selection import train_test_split
import joblib
from scipy.fftpack import fft, ifft  # fast Fourier transform
from scipy import optimize
from scipy.cluster.hierarchy import dendrogram, ward
from pyecharts.components import Table as TableFisrt  # table rendering
from pyecharts.options.series_options import JsCode
from pyecharts.charts import Tab as tab_First
from pyecharts.charts import *
from pyecharts import options as opts
from pyecharts.components import Image
from pyecharts.globals import CurrentConfig

CurrentConfig.ONLINE_HOST = f"{getcwd()}/assets/"
# Settings
np.set_printoptions(threshold=np.inf)
global_setting = dict(
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    legend_opts=opts.LegendOpts(pos_bottom="3%", type_="scroll"),
)
global_not_legend = dict(
    toolbox_opts=opts.ToolboxOpts(is_show=True),
    legend_opts=opts.LegendOpts(is_show=False),
)
label_setting = dict(label_opts=opts.LabelOpts(is_show=False))
more_global = False  # plot with all features?
all_global = True  # export charts?
csv_global = True  # export CSV?
clf_global = True  # export models?
tar_global = True  # pack outputs into a tar archive?
new_dir_global = True  # create a new directory?


class Tab(tab_First):
    def __init__(self, *args, **kwargs):
        super(Tab, self).__init__(*args, **kwargs)
        self.element = {}  # charts that make up the tab, name -> chart

    def add(self, chart, tab_name):
        self.element[tab_name] = chart
        return super(Tab, self).add(chart, tab_name)

    def render(
        self,
        path: str = "render.html",
        template_name: str = "simple_tab.html",
        *args,
        **kwargs,
    ) -> str:
        if all_global:
            render_dir = path_split(path)[0]
            for i in self.element:
                self.element[i].render(render_dir + "/" + i + ".html")
        return super(Tab, self).render(path, template_name, *args, **kwargs)
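

# Usage sketch (hypothetical paths): with all_global enabled, Tab.render also writes
# each registered chart to its own HTML file next to the combined page:
#     tab = Tab()
#     tab.add(Bar().add_xaxis(["a"]).add_yaxis("demo", [1]), "demo")
#     tab.render("output/render.html")  # also writes output/demo.html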


class Table(TableFisrt):
    def __init__(self, *args, **kwargs):
        super(Table, self).__init__(*args, **kwargs)
        self.HEADERS = []
        self.ROWS = [[]]

    def add(self, headers, rows, attributes=None):
        if len(rows) == 1:
            new_headers = ["data type", "data"]
            new_rows = list(zip(headers, rows[0]))
            self.HEADERS = new_headers
            self.ROWS = new_rows
            return super().add(new_headers, new_rows, attributes)
        else:
            self.HEADERS = headers
            self.ROWS = rows
            return super().add(headers, rows, attributes)

    def render(self, path="render.html", *args, **kwargs) -> str:
        if csv_global:
            save_dir, name = path_split(path)
            name = splitext(name)[0]
            try:
                DataFrame(self.ROWS, columns=self.HEADERS).to_csv(
                    save_dir + "/" + name + ".csv"
                )
            except BaseException:
                pass
        return super().render(path, *args, **kwargs)
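

# Usage sketch (hypothetical paths): with csv_global enabled, Table.render also dumps
# the recorded rows to a CSV file next to the HTML output:
#     table = Table().add(["a", "b"], [[1, 2], [3, 4]])
#     table.render("output/table.html")  # also writes output/table.csv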


def make_list(first, end, num=35):  # return ~num evenly spaced values covering [first, end]
    if end == first:  # degenerate range: avoid dividing by zero
        return [round(first, 2)]
    n = num / (end - first)  # scale factor: unit steps in scaled space give ~num points
    if n == 0:
        n = 1
    result = []
    n_first = first * n
    n_end = end * n
    while n_first <= n_end:
        result.append(round(n_first / n, 2))
        n_first += 1
    return result
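

# Example: make_list scales the range so that roughly `num` evenly spaced values come back,
# e.g. make_list(0, 2, 4) -> [0.0, 0.5, 1.0, 1.5, 2.0].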


def list_filter(original_list, num=70):  # downsample a list to at most ~num elements
    if len(original_list) <= num:
        return original_list
    n = math.ceil(len(original_list) / num)  # stride between kept elements
    return original_list[::n]
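

# Example: list_filter keeps at most ~num elements by striding,
# e.g. list_filter(list(range(10)), 4) -> [0, 3, 6, 9].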


def prediction_boundary(x_range, x_means, predict_func, data_type):  # regression-style x-x heat maps
    # x_range is the list of plotting ranges, x_means holds the fixed values of the
    # remaining features, predict_func is the prediction callback
    # a: feature x, b: feature x-1, c: the other features
    render_list = []
    if len(x_means) == 1:
        return render_list
    for i in range(len(x_means)):
        for j in range(len(x_means)):
            if j <= i:
                continue
            a_range = x_range[j]
            a_type = data_type[j]
            b_range = x_range[i]
            b_type = data_type[i]
            if a_type == 1:
                a_list = make_list(a_range[0], a_range[1], 70)
            else:
                a_list = list_filter(a_range)  # accepts at most 70 values
            if b_type == 1:
                b_list = make_list(b_range[0], b_range[1], 35)
            else:
                b_list = list_filter(b_range)  # accepts at most 70 values
            a = np.array([i for i in a_list for _ in b_list]).T
            b = np.array([i for _ in a_list for i in b_list]).T
            data = np.array([x_means for _ in a_list for i in b_list])
            data[:, j] = a
            data[:, i] = b
            y_data = predict_func(data)[0].tolist()
            value = [[float(a[i]), float(b[i]), y_data[i]] for i in range(len(a))]
            c = (
                HeatMap()
                .add_xaxis(np.unique(a))
                # the first number of each value triple is x
                .add_yaxis("data", np.unique(b), value, **label_setting)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title="prediction heat map"),
                    **global_not_legend,
                    yaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                    xaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                    visualmap_opts=opts.VisualMapOpts(
                        is_show=True,
                        max_=int(max(y_data)) + 1,
                        min_=int(min(y_data)),
                        pos_right="3%",
                    ),
                )
            )
            render_list.append(c)
    return render_list
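

# Minimal usage sketch (hypothetical names; assumes a fitted sklearn regressor `model`
# with two features). The callback must return something whose first element is the
# prediction array, matching the predict_func(data)[0] call above:
#     x_range = [[0.0, 10.0], [0.0, 5.0]]   # plotting range per feature
#     x_means = [5.0, 2.5]                  # values the other features are held at
#     data_type = [1, 1]                    # 1 marks a continuous feature
#     charts = prediction_boundary(x_range, x_means,
#                                  lambda x: (model.predict(x),), data_type)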


# Regression-style x-x heat maps (more feature pairings)
def prediction_boundary_more(x_range, x_means, predict_func, data_type):
    # x_range is the list of plotting ranges, x_means holds the fixed values of the
    # remaining features, predict_func is the prediction callback
    # a: feature x, b: feature x-1, c: the other features
    render_list = []
    if len(x_means) == 1:
        return render_list
    for i in range(len(x_means)):
        if i == 0:
            continue
        a_range = x_range[i - 1]
        a_type = data_type[i - 1]
        b_range = x_range[i]
        b_type = data_type[i]
        if a_type == 1:
            a_list = make_list(a_range[0], a_range[1], 70)
        else:
            a_list = list_filter(a_range)  # accepts at most 70 values
        if b_type == 1:
            b_list = make_list(b_range[0], b_range[1], 35)
        else:
            b_list = list_filter(b_range)  # accepts at most 70 values
        a = np.array([i for i in a_list for _ in b_list]).T
        b = np.array([i for _ in a_list for i in b_list]).T
        data = np.array([x_means for _ in a_list for i in b_list])
        data[:, i - 1] = a
        data[:, i] = b
        y_data = predict_func(data)[0].tolist()
        value = [[float(a[i]), float(b[i]), y_data[i]] for i in range(len(a))]
        c = (
            HeatMap()
            .add_xaxis(np.unique(a))
            # the first number of each value triple is x
            .add_yaxis("data", np.unique(b), value, **label_setting)
            .set_global_opts(
                title_opts=opts.TitleOpts(title="prediction heat map"),
                **global_not_legend,
                yaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                xaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                visualmap_opts=opts.VisualMapOpts(
                    is_show=True,
                    max_=int(max(y_data)) + 1,
                    min_=int(min(y_data)),
                    pos_right="3%",
                ),
            )
        )
        render_list.append(c)
    return render_list


def decision_boundary(
    x_range, x_means, predict_func, class_list, data_type, no_unknow=False
):  # classification-style x-x prediction heat maps
    # x_range is the list of plotting ranges, x_means holds the fixed values of the other
    # features, predict_func is the prediction callback, class_list holds the classes
    # a: feature x, b: feature x-1, c: the other features
    # convention: i-1 is the x axis (a and x_1 refer to the x axis)
    class_dict = dict(zip(class_list, [i for i in range(len(class_list))]))
    if not no_unknow:
        map_dict = [{"min": -1.5, "max": -0.5, "label": "unknown"}]  # piecewise display
    else:
        map_dict = []
    for i in class_dict:
        map_dict.append(
            {"min": class_dict[i] - 0.5, "max": class_dict[i] + 0.5, "label": str(i)}
        )
    render_list = []
    if len(x_means) == 1:
        a_range = x_range[0]
        if data_type[0] == 1:
            a_list = make_list(a_range[0], a_range[1], 70)
        else:
            a_list = a_range
        a = np.array([i for i in a_list]).reshape(-1, 1)
        y_data = predict_func(a)[0].tolist()
        value = [[0, float(a[i]), class_dict.get(y_data[i], -1)] for i in range(len(a))]
        c = (
            HeatMap()
            .add_xaxis(["None"])
            # the first number of each value triple is x
            .add_yaxis("data", np.unique(a), value, **label_setting)
            .set_global_opts(
                title_opts=opts.TitleOpts(title="prediction heat map"),
                **global_not_legend,
                yaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                xaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                visualmap_opts=opts.VisualMapOpts(
                    is_show=True,
                    max_=max(class_dict.values()),
                    min_=-1,
                    is_piecewise=True,
                    pieces=map_dict,
                    orient="horizontal",
                    pos_bottom="3%",
                ),
            )
        )
        render_list.append(c)
        return render_list
    # when len(x_means) != 1, run the code below
    for i in range(len(x_means)):
        if i == 0:
            continue
        a_range = x_range[i - 1]
        a_type = data_type[i - 1]
        b_range = x_range[i]
        b_type = data_type[i]
        if a_type == 1:
            a_list = make_list(a_range[0], a_range[1], 70)
        else:
            a_list = a_range
        if b_type == 1:
            rb = make_list(b_range[0], b_range[1], 35)
        else:
            rb = b_range
        a = np.array([i for i in a_list for _ in rb]).T
        b = np.array([i for _ in a_list for i in rb]).T
        data = np.array([x_means for _ in a_list for i in rb])
        data[:, i - 1] = a
        data[:, i] = b
        y_data = predict_func(data)[0].tolist()
        value = [
            [float(a[i]), float(b[i]), class_dict.get(y_data[i], -1)]
            for i in range(len(a))
        ]
        c = (
            HeatMap()
            .add_xaxis(np.unique(a))
            # the first number of each value triple is x
            .add_yaxis("data", np.unique(b), value, **label_setting)
            .set_global_opts(
                title_opts=opts.TitleOpts(title="prediction heat map"),
                **global_not_legend,
                yaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                xaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                visualmap_opts=opts.VisualMapOpts(
                    is_show=True,
                    max_=max(class_dict.values()),
                    min_=-1,
                    is_piecewise=True,
                    pieces=map_dict,
                    orient="horizontal",
                    pos_bottom="3%",
                ),
            )
        )
        render_list.append(c)
    return render_list
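

# Minimal usage sketch (hypothetical names; assumes a fitted sklearn classifier `clf`
# and x_range/x_means/data_type built as in the prediction_boundary sketch above):
#     charts = decision_boundary(x_range, x_means,
#                                lambda x: (clf.predict(x),),
#                                list(clf.classes_), data_type)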


# Classification-style x-x prediction heat maps (more feature pairings)
def decision_boundary_more(
    x_range, x_means, predict_func, class_list, data_type, no_unknow=False
):
    # x_range is the list of plotting ranges, x_means holds the fixed values of the other
    # features, predict_func is the prediction callback, class_list holds the classes
    # a: feature x, b: feature x-1, c: the other features
    # convention: i-1 is the x axis (a and x_1 refer to the x axis)
    class_dict = dict(zip(class_list, [i for i in range(len(class_list))]))
    if not no_unknow:
        map_dict = [{"min": -1.5, "max": -0.5, "label": "unknown"}]  # piecewise display
    else:
        map_dict = []
    for i in class_dict:
        map_dict.append(
            {"min": class_dict[i] - 0.5, "max": class_dict[i] + 0.5, "label": str(i)}
        )
    render_list = []
    if len(x_means) == 1:
        return decision_boundary(
            x_range, x_means, predict_func, class_list, data_type, no_unknow
        )
    # when len(x_means) != 1, run the code below
    for i in range(len(x_means)):
        for j in range(len(x_means)):
            if j <= i:
                continue
            a_range = x_range[j]
            a_type = data_type[j]
            b_range = x_range[i]
            b_type = data_type[i]
            if a_type == 1:
                a_range = make_list(a_range[0], a_range[1], 70)
            if b_type == 1:
                b_range = make_list(b_range[0], b_range[1], 35)
            a = np.array([i for i in a_range for _ in b_range]).T
            b = np.array([i for _ in a_range for i in b_range]).T
            data = np.array([x_means for _ in a_range for i in b_range])
            data[:, j] = a
            data[:, i] = b
            y_data = predict_func(data)[0].tolist()
            value = [
                [float(a[i]), float(b[i]), class_dict.get(y_data[i], -1)]
                for i in range(len(a))
            ]
            c = (
                HeatMap()
                .add_xaxis(np.unique(a))
                # the first number of each value triple is x
                .add_yaxis("data", np.unique(b), value, **label_setting)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title="prediction heat map"),
                    **global_not_legend,
                    yaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                    xaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                    visualmap_opts=opts.VisualMapOpts(
                        is_show=True,
                        max_=max(class_dict.values()),
                        min_=-1,
                        is_piecewise=True,
                        pieces=map_dict,
                        orient="horizontal",
                        pos_bottom="3%",
                    ),
                )
            )
            render_list.append(c)
    return render_list


def see_tree(tree_file_dir):
    node_regex = re.compile(r'^([0-9]+) \[label="(.+)"\] ;$')  # matches node lines
    link_regex = re.compile("^([0-9]+) -> ([0-9]+) (.*);$")  # matches edge lines
    node_dict = {}
    link_list = []
    with open(tree_file_dir, "r") as f:  # apparently must be opened separately for w and r
        for i in f:
            try:
                regex_result = re.findall(node_regex, i)[0]
                if regex_result[0] != "":
                    try:
                        v = float(regex_result[0])
                    except BaseException:
                        v = 0
                    node_dict[regex_result[0]] = {
                        "name": regex_result[1].replace("\\n", "\n"),
                        "value": v,
                        "children": [],
                    }
                    continue
            except BaseException:
                pass
            try:
                regex_result = re.findall(link_regex, i)[0]
                if regex_result[0] != "" and regex_result[1] != "":
                    link_list.append((regex_result[0], regex_result[1]))
            except BaseException:
                pass
    father_list = []  # nodes that already have a parent
    for i in link_list:
        father = i[0]  # parent node
        son = i[1]  # child node
        try:
            node_dict[father]["children"].append(node_dict[son])
            father_list.append(son)
        except BaseException:
            pass
    father = list(set(node_dict.keys()) - set(father_list))
    c = (
        Tree()
        .add("", [node_dict[father[0]]], is_roam=True)
        .set_global_opts(
            title_opts=opts.TitleOpts(title="decision tree visualization"),
            toolbox_opts=opts.ToolboxOpts(is_show=True),
        )
    )
    return c
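

# Usage sketch (hypothetical paths): export a fitted tree with sklearn, then render it:
#     export_graphviz(tree_clf, out_file="output/tree.dot")
#     see_tree("output/tree.dot").render("output/tree.html")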


def make_tab(heard, row):
    return Table().add(headers=heard, rows=row)


def coefficient_scatter_plot(w_heard, w):
    c = (
        Scatter()
        .add_xaxis(w_heard)
        .add_yaxis("", w, **label_setting)
        .set_global_opts(title_opts=opts.TitleOpts(title="coefficient w scatter plot"), **global_setting)
    )
    return c


def coefficient_bar_plot(w_heard, w):
    c = (
        Bar()
        .add_xaxis(w_heard)
        .add_yaxis("", abs(w).tolist(), **label_setting)
        .set_global_opts(title_opts=opts.TitleOpts(title="coefficient w bar chart"), **global_setting)
    )
    return c


def is_continuous(data: np.ndarray, f: float = 0.1):
    # Heuristic: treat the column as continuous when the ratio of unique values is at least f
    data = data.tolist()
    unique_values: list = np.unique(data).tolist()
    try:
        result = len(unique_values) / len(data) >= f or len(data) <= 3
        return result
    except BaseException:
        return False
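

# Examples: a column counts as continuous when at least f (default 10%) of its values
# are unique, or when it is too short to judge:
#     is_continuous(np.array([1.0, 2.0, 3.0, 4.0]))  # True  (all values unique)
#     is_continuous(np.zeros(100))                   # False (1 unique value in 100)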


def quick_stats(x_data):
    statistics_assistant = CategoricalData()  # CategoricalData is defined elsewhere in this file
    for i in range(len(x_data)):
        x1 = x_data[i]  # x coordinate
        statistics_assistant(x1)
    return statistics_assistant


# Scatter plots of feature pairs colored by class (draws every x-x combination)
def training_visualization_more_no_center(x_data, class_list, y_data):
    x_data = x_data.transpose()
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
    statistics_assistant = quick_stats(x_data)
    render_list = []
    for i in range(len(x_data)):
        for a in range(len(x_data)):
            if a <= i:
                continue
            x1 = x_data[i]  # x coordinate
            x1_is_continuous = is_continuous(x1)
            x2 = x_data[a]  # y coordinate
            x2_is_continuous = is_continuous(x2)
            base_render = None  # the previous chart
            for class_num in range(len(class_list)):
                now_class = class_list[class_num]
                plot_x1 = x1[y_data == now_class].tolist()
                plot_x2 = x2[y_data == now_class]
                axis_x2 = np.unique(plot_x2)
                plot_x2 = x2[y_data == now_class].tolist()
                # unlike a plain scatter plot, x1 is drawn on the vertical axis here
                c = (
                    Scatter()
                    .add_xaxis(plot_x2)
                    .add_yaxis(f"{now_class}", plot_x1, **label_setting)
                    .set_global_opts(
                        title_opts=opts.TitleOpts(title=f"[{a}-{i}] training data scatter plot"),
                        **global_setting,
                        yaxis_opts=opts.AxisOpts(
                            type_="value" if x1_is_continuous else "category",
                            is_scale=True,
                        ),
                        xaxis_opts=opts.AxisOpts(
                            type_="value" if x2_is_continuous else "category",
                            is_scale=True,
                        ),
                    )
                )
                c.add_xaxis(axis_x2)
                if base_render is None:
                    base_render = c
                else:
                    base_render = base_render.overlap(c)
            render_list.append(base_render)
    means, x_range, data_type = statistics_assistant.get()
    return render_list, means, x_range, data_type


# Scatter plots of feature pairs colored by class, with cluster centers (every x-x combination)
def training_visualization_more(x_data, class_list, y_data, center):
    x_data = x_data.transpose()
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
    statistics_assistant = quick_stats(x_data)
    render_list = []
    for i in range(len(x_data)):
        for a in range(len(x_data)):
            if a <= i:
                continue
            x1 = x_data[i]  # x coordinate
            x1_is_continuous = is_continuous(x1)
            x2 = x_data[a]  # y coordinate
            x2_is_continuous = is_continuous(x2)
            base_render = None  # the previous chart
            for class_num in range(len(class_list)):
                now_class = class_list[class_num]
                plot_x1 = x1[y_data == now_class].tolist()
                plot_x2 = x2[y_data == now_class]
                axis_x2 = np.unique(plot_x2)
                plot_x2 = x2[y_data == now_class].tolist()
                # unlike a plain scatter plot, x1 is drawn on the vertical axis here
                c = (
                    Scatter()
                    .add_xaxis(plot_x2)
                    .add_yaxis(f"{now_class}", plot_x1, **label_setting)
                    .set_global_opts(
                        title_opts=opts.TitleOpts(title=f"[{a}-{i}] training data scatter plot"),
                        **global_setting,
                        yaxis_opts=opts.AxisOpts(
                            type_="value" if x1_is_continuous else "category",
                            is_scale=True,
                        ),
                        xaxis_opts=opts.AxisOpts(
                            type_="value" if x2_is_continuous else "category",
                            is_scale=True,
                        ),
                    )
                )
                c.add_xaxis(axis_x2)
                # add the cluster center
                try:
                    center_x2 = [center[class_num][a]]
                except BaseException:
                    center_x2 = [0]
                b = (
                    Scatter()
                    .add_xaxis(center_x2)
                    .add_yaxis(
                        f"[{now_class}] center",
                        [center[class_num][i]],
                        **label_setting,
                        symbol="triangle",
                    )
                    .set_global_opts(
                        title_opts=opts.TitleOpts(title="cluster centers"),
                        **global_setting,
                        yaxis_opts=opts.AxisOpts(
                            type_="value" if x1_is_continuous else "category",
                            is_scale=True,
                        ),
                        xaxis_opts=opts.AxisOpts(
                            type_="value" if x2_is_continuous else "category",
                            is_scale=True,
                        ),
                    )
                )
                c.overlap(b)
                if base_render is None:
                    base_render = c
                else:
                    base_render = base_render.overlap(c)
            render_list.append(base_render)
    means, x_range, data_type = statistics_assistant.get()
    return render_list, means, x_range, data_type


# Scatter plots of adjacent feature pairs colored by class, with cluster centers
def training_visualization_center(x_data, class_data, y_data, center):
    x_data = x_data.transpose()
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
    statistics_assistant = quick_stats(x_data)
    render_list = []
    for i in range(len(x_data)):
        if i == 0:
            continue
        x1 = x_data[i]  # x coordinate
        x1_is_continuous = is_continuous(x1)
        x2 = x_data[i - 1]  # y coordinate
        x2_is_continuous = is_continuous(x2)
        base_render = None  # the previous chart
        for class_num in range(len(class_data)):
            n_class = class_data[class_num]
            x_1 = x1[y_data == n_class].tolist()
            x_2 = x2[y_data == n_class]
            x_2_new = np.unique(x_2)
            x_2 = x2[y_data == n_class].tolist()
            # unlike a plain scatter plot, x1 is drawn on the vertical axis here
            c = (
                Scatter()
                .add_xaxis(x_2)
                .add_yaxis(f"{n_class}", x_1, **label_setting)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title=f"[{i-1}-{i}] training data scatter plot"),
                    **global_setting,
                    yaxis_opts=opts.AxisOpts(
                        type_="value" if x1_is_continuous else "category", is_scale=True
                    ),
                    xaxis_opts=opts.AxisOpts(
                        type_="value" if x2_is_continuous else "category", is_scale=True
                    ),
                )
            )
            c.add_xaxis(x_2_new)
            # add the cluster center
            try:
                center_x_2 = [center[class_num][i - 1]]
            except BaseException:
                center_x_2 = [0]
            b = (
                Scatter()
                .add_xaxis(center_x_2)
                .add_yaxis(
                    f"[{n_class}] center",
                    [center[class_num][i]],
                    **label_setting,
                    symbol="triangle",
                )
                .set_global_opts(
                    title_opts=opts.TitleOpts(title="cluster centers"),
                    **global_setting,
                    yaxis_opts=opts.AxisOpts(
                        type_="value" if x1_is_continuous else "category", is_scale=True
                    ),
                    xaxis_opts=opts.AxisOpts(
                        type_="value" if x2_is_continuous else "category", is_scale=True
                    ),
                )
            )
            c.overlap(b)
            if base_render is None:
                base_render = c
            else:
                base_render = base_render.overlap(c)
        render_list.append(base_render)
    means, x_range, data_type = statistics_assistant.get()
    return render_list, means, x_range, data_type


def training_visualization(x_data, class_, y_data):  # x-x class scatter plots for adjacent feature pairs
    x_data = x_data.transpose()
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
    statistics_assistant = quick_stats(x_data)
    render_list = []
    for i in range(len(x_data)):
        if i == 0:
            continue
        x1 = x_data[i]  # x coordinate
        x1_is_continuous = is_continuous(x1)
        x2 = x_data[i - 1]  # y coordinate
        x2_is_continuous = is_continuous(x2)
        base_render = None  # the previous chart
        for now_class in class_:
            plot_x1 = x1[y_data == now_class].tolist()
            plot_x2 = x2[y_data == now_class]
            axis_x2 = np.unique(plot_x2)
            plot_x2 = x2[y_data == now_class].tolist()
            # unlike a plain scatter plot, x1 is drawn on the vertical axis here
            c = (
                Scatter()
                .add_xaxis(plot_x2)
                .add_yaxis(f"{now_class}", plot_x1, **label_setting)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title="training data scatter plot"),
                    **global_setting,
                    yaxis_opts=opts.AxisOpts(
                        type_="value" if x1_is_continuous else "category", is_scale=True
                    ),
                    xaxis_opts=opts.AxisOpts(
                        type_="value" if x2_is_continuous else "category", is_scale=True
                    ),
                )
            )
            c.add_xaxis(axis_x2)
            if base_render is None:
                base_render = c
            else:
                base_render = base_render.overlap(c)
        render_list.append(base_render)
    means, x_range, data_type = statistics_assistant.get()
    return render_list, means, x_range, data_type


def training_visualization_no_class(x_data):  # x-x scatter plots (no class labels)
    x_data = x_data.transpose()
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
    statistics_assistant = quick_stats(x_data)
    render_list = []
    for i in range(len(x_data)):
        if i == 0:
            continue
        x1 = x_data[i]  # x coordinate
        x1_is_continuous = is_continuous(x1)
        x2 = x_data[i - 1]  # y coordinate
        x2_is_continuous = is_continuous(x2)
        x2_only = np.unique(x2)
        # unlike a plain scatter plot, x1 is drawn on the vertical axis here
        c = (
            Scatter()
            .add_xaxis(x2)
            .add_yaxis("", x1.tolist(), **label_setting)
            .set_global_opts(
                title_opts=opts.TitleOpts(title="training data scatter plot"),
                **global_not_legend,
                yaxis_opts=opts.AxisOpts(
                    type_="value" if x1_is_continuous else "category", is_scale=True
                ),
                xaxis_opts=opts.AxisOpts(
                    type_="value" if x2_is_continuous else "category", is_scale=True
                ),
            )
        )
        c.add_xaxis(x2_only)
        render_list.append(c)
    means, x_range, data_type = statistics_assistant.get()
    return render_list, means, x_range, data_type


def training_w(
    x_data, class_list, y_data, w_list, b_list, x_means: list
):  # draw decision boundaries for classification problems
    x_data = x_data.transpose()
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
    render_list = []
    x_means.append(0)
    x_means = np.array(x_means)
    for i in range(len(x_data)):
        if i == 0:
            continue
        x1_is_continuous = is_continuous(x_data[i])
        x2 = x_data[i - 1]  # y coordinate
        x2_is_continuous = is_continuous(x2)
        o_c = None  # the previous chart
        for class_num in range(len(class_list)):
            n_class = class_list[class_num]
            x2_only = np.unique(x2[y_data == n_class])
            # unlike a plain scatter plot, x1 is drawn on the vertical axis here
            # this branch works around sklearn's historical two-class convention
            if len(class_list) == 2:  # binary classification
                if class_num == 0:
                    continue
                w = w_list[0]
                b = b_list[0]
            else:  # multi-class classification
                w = w_list[class_num]
                b = b_list[class_num]
            if x2_is_continuous:
                x2_only = np.array(make_list(x2_only.min(), x2_only.max(), 5))
            w = np.append(w, 0)
            line_y = (
                -(x2_only * w[i - 1]) / w[i]
                + b
                + (x_means[: i - 1] * w[: i - 1]).sum()
                + (x_means[i + 1:] * w[i + 1:]).sum()
            )  # assume every feature other than these two is fixed at its mean
            c = (
                Line()
                .add_xaxis(x2_only)
                .add_yaxis(
                    f"decision boundary: {n_class}=>[{i}]",
                    line_y.tolist(),
                    is_smooth=True,
                    **label_setting,
                )
                .set_global_opts(
                    title_opts=opts.TitleOpts(title="coefficient w curve"),
                    **global_setting,
                    yaxis_opts=opts.AxisOpts(
                        type_="value" if x1_is_continuous else "category", is_scale=True
                    ),
                    xaxis_opts=opts.AxisOpts(
                        type_="value" if x2_is_continuous else "category", is_scale=True
                    ),
                )
            )
            if o_c is None:
                o_c = c
            else:
                o_c = o_c.overlap(c)
        # do not add code below this line: the loop above may continue past it
        render_list.append(o_c)
    return render_list


def regress_w(x_data, w_data: np.ndarray, intercept_b, x_means: list):  # for regression problems (y-x plots)
    x_data = x_data.transpose()
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
    render_list = []
    x_means.append(0)  # make sure means[i + 1] cannot go out of range
    x_means = np.array(x_means)
    w_data = np.append(w_data, 0)
    for i in range(len(x_data)):
        x1 = x_data[i]
        x1_is_continuous = is_continuous(x1)
        if x1_is_continuous:
            x1 = np.array(make_list(x1.min(), x1.max(), 5))
        x1_only = np.unique(x1)
        # assume every feature other than this one is fixed at its mean
        y_data = (
            x1_only * w_data[i]
            + intercept_b
            + (x_means[:i] * w_data[:i]).sum()
            + (x_means[i + 1:] * w_data[i + 1:]).sum()
        )
        y_is_continuous = is_continuous(y_data)
        c = (
            Line()
            .add_xaxis(x1_only)
            .add_yaxis(f"fit result=>[{i}]", y_data.tolist(), is_smooth=True, **label_setting)
            .set_global_opts(
                title_opts=opts.TitleOpts(title="coefficient w curve"),
                **global_setting,
                yaxis_opts=opts.AxisOpts(
                    type_="value" if y_is_continuous else None, is_scale=True
                ),
                xaxis_opts=opts.AxisOpts(
                    type_="value" if x1_is_continuous else None, is_scale=True
                ),
            )
        )
        render_list.append(c)
    return render_list
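

# Usage sketch (hypothetical names; assumes a fitted sklearn LinearRegression `reg`):
# each chart plots y = x_i * w_i + b + sum of mean_j * w_j over the remaining features:
#     charts = regress_w(x_data, reg.coef_, reg.intercept_,
#                        x_means=x_data.mean(axis=0).tolist())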


def regress_visualization(x_data, y_data):  # y-x data plots
    x_data = x_data.transpose()
    y_is_continuous = is_continuous(y_data)
    statistics_assistant = quick_stats(x_data)
    render_list = []
    try:
        visualmap_opts = opts.VisualMapOpts(
            is_show=True,
            max_=int(y_data.max()) + 1,
            min_=int(y_data.min()),
            pos_right="3%",
        )
    except BaseException:
        visualmap_opts = None
        y_is_continuous = False
    for i in range(len(x_data)):
        x1 = x_data[i]  # x coordinate
        x1_is_continuous = is_continuous(x1)
        y1 = y_data
        y1_is_continuous = y_is_continuous
        # kept as ndarrays to preserve the dtype's precision; converting to list
        # causes all sorts of plotting problems (duplicated values)
        if not y1_is_continuous and x1_is_continuous:
            y1_is_continuous, x1_is_continuous = x1_is_continuous, y1_is_continuous
            x1, y1 = y1, x1
        c = (
            Scatter()
            .add_xaxis(x1.tolist())  # this is the horizontal axis
            .add_yaxis("data", y1.tolist(), **label_setting)
            .set_global_opts(
                title_opts=opts.TitleOpts(title="prediction plot"),
                **global_setting,
                yaxis_opts=opts.AxisOpts(
                    type_="value" if y1_is_continuous else "category", is_scale=True
                ),
                xaxis_opts=opts.AxisOpts(
                    type_="value" if x1_is_continuous else "category", is_scale=True
                ),
                visualmap_opts=visualmap_opts,
            )
        )
        c.add_xaxis(np.unique(x1))
        render_list.append(c)
    means, x_range, data_type = statistics_assistant.get()
    return render_list, means, x_range, data_type


def feature_visualization(x_data, data_name=""):  # x-x data plots
    setting = global_setting if data_name else global_not_legend
    x_data = x_data.transpose()
    only = False
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
        only = True
    render_list = []
    for i in range(len(x_data)):
        for a in range(len(x_data)):
            if a <= i:
                continue  # duplicated pair, skip
            x1 = x_data[i]  # x coordinate
            x1_is_continuous = is_continuous(x1)
            x2 = x_data[a]  # y coordinate
            x2_is_continuous = is_continuous(x2)
            x2_only = np.unique(x2)
            if only:
                x2_is_continuous = False
            # unlike a plain scatter plot, x1 is drawn on the vertical axis here
            c = (
                Scatter()
                .add_xaxis(x2)
                .add_yaxis(data_name, x1, **label_setting)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title=f"[{i}-{a}] data scatter plot"),
                    **setting,
                    yaxis_opts=opts.AxisOpts(
                        type_="value" if x1_is_continuous else "category", is_scale=True
                    ),
                    xaxis_opts=opts.AxisOpts(
                        type_="value" if x2_is_continuous else "category", is_scale=True
                    ),
                )
            )
            c.add_xaxis(x2_only)
            render_list.append(c)
    return render_list


def feature_visualization_format(x_data, data_name=""):  # x-x data plots with formatted tooltips
    setting = global_setting if data_name else global_not_legend
    x_data = x_data.transpose()
    only = False
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
        only = True
    render_list = []
    for i in range(len(x_data)):
        for a in range(len(x_data)):
            if a <= i:
                continue  # duplicated pair, skip (a only reads indices after i)
            x1 = x_data[i]  # x coordinate
            x1_is_continuous = is_continuous(x1)
            x2 = x_data[a]  # y coordinate
            x2_is_continuous = is_continuous(x2)
            x2_only = np.unique(x2)
            x1_list = x1.astype(str).tolist()
            for k in range(len(x1_list)):
                x1_list[k] = [x1_list[k], f"feature {k}"]
            if only:
                x2_is_continuous = False
            # unlike a plain scatter plot, x1 is drawn on the vertical axis here
            c = (
                Scatter()
                .add_xaxis(x2)
                .add_yaxis(data_name, x1_list, **label_setting)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title=f"[{i}-{a}] data scatter plot"),
                    **setting,
                    yaxis_opts=opts.AxisOpts(
                        type_="value" if x1_is_continuous else "category", is_scale=True
                    ),
                    xaxis_opts=opts.AxisOpts(
                        type_="value" if x2_is_continuous else "category", is_scale=True
                    ),
                    tooltip_opts=opts.TooltipOpts(
                        is_show=True, axis_pointer_type="cross", formatter="{c}"
                    ),
                )
            )
            c.add_xaxis(x2_only)
            render_list.append(c)
    return render_list


def discrete_feature_visualization(x_data, data_name=""):  # strictly discrete x-x data plots
    setting = global_setting if data_name else global_not_legend
    x_data = x_data.transpose()
    if len(x_data) == 1:
        x_data = np.array([x_data[0], np.zeros(len(x_data[0]))])
    render_list = []
    for i in range(len(x_data)):
        for a in range(len(x_data)):
            if a <= i:
                continue  # duplicated pair, skip
            x1 = x_data[i]  # x coordinate
            x2 = x_data[a]  # y coordinate
            x2_only = np.unique(x2)
            # unlike a plain scatter plot, x1 is drawn on the vertical axis here
            c = (
                Scatter()
                .add_xaxis(x2)
                .add_yaxis(data_name, x1, **label_setting)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title=f"[{i}-{a}] data scatter plot"),
                    **setting,
                    yaxis_opts=opts.AxisOpts(type_="category", is_scale=True),
                    xaxis_opts=opts.AxisOpts(type_="category", is_scale=True),
                )
            )
            c.add_xaxis(x2_only)
            render_list.append(c)
    return render_list


def conversion_control(y_data, x_data, tab):  # overlap the two x-x plots
    if isinstance(x_data, np.ndarray) and isinstance(y_data, np.ndarray):
        get_x = feature_visualization(x_data, "original data")  # before
        get_y = feature_visualization(y_data, "transformed data")  # after
        for i in range(len(get_x)):
            tab.add(get_x[i].overlap(get_y[i]), f"[{i}] x-x scatter plot")
    return tab


def conversion_separate(y_data, x_data, tab):  # show the two x-x plots side by side
    if isinstance(x_data, np.ndarray) and isinstance(y_data, np.ndarray):
        get_x = feature_visualization(x_data, "original data")  # before
        get_y = feature_visualization(y_data, "transformed data")  # after
        for i in range(len(get_x)):
            try:
                tab.add(get_x[i], f"[{i}] x-x scatter plot")
            except IndexError:
                pass
            try:
                tab.add(get_y[i], f"[{i}] transformed x-x scatter plot")
            except IndexError:
                pass
    return tab


def conversion_separate_format(y_data, tab):  # show the transformed x-x plots side by side
    if isinstance(y_data, np.ndarray):
        get_y = feature_visualization_format(y_data, "transformed data")  # after
        for i in range(len(get_y)):
            tab.add(get_y[i], f"[{i}] transformed x-x scatter plot")
    return tab


def conversion_separate_wh(w_array, h_array, tab):  # show the W and H matrices side by side
    if isinstance(w_array, np.ndarray) and isinstance(h_array, np.ndarray):
        get_x = feature_visualization_format(w_array, "W matrix data")  # before
        get_y = feature_visualization(
            h_array.transpose(), "H matrix data"
        )  # transposed here and transposed back inside; W*H matches rows against columns
        for i in range(len(get_x)):
            try:
                tab.add(get_x[i], f"[{i}] W matrix x-x scatter plot")
            except IndexError:
                pass
            try:
                tab.add(get_y[i], f"[{i}] H.T matrix x-x scatter plot")
            except IndexError:
                pass
    return tab


def make_bar(name, value, tab):  # draw a bar chart
    c = (
        Bar()
        .add_xaxis([f"[{i}] feature" for i in range(len(value))])
        .add_yaxis(name, value, **label_setting)
        .set_global_opts(title_opts=opts.TitleOpts(title="coefficient w bar chart"), **global_setting)
    )
    tab.add(c, name)


def judging_digits(num: (int, float)):  # count the digits of the integer part
    a = str(abs(num)).split(".")[0]
    if a == "":
        raise ValueError
    return len(a)
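

# Examples: judging_digits(123.45) -> 3, judging_digits(0.5) -> 1.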
  1055. class Learner:
  1056. def __init__(self, *args, **kwargs):
  1057. self.numpy_dict = {} # name:numpy
  1058. self.fucn_add() # 制作Func_Dic
  1059. def add_form(self, data: np.array, name):
  1060. name = f"{name}[{len(self.numpy_dict)}]"
  1061. self.numpy_dict[name] = data
  1062. def read_csv(self, file_dir, name, encoding="utf-8", str_must=False, sep=","):
  1063. dtype = np.str if str_must else np.float
  1064. dataframe = read_csv(file_dir, encoding=encoding, delimiter=sep, header=None)
  1065. try:
  1066. data = dataframe.to_numpy(dtype=dtype)
  1067. except ValueError:
  1068. data = dataframe.to_numpy(dtype=np.str)
  1069. if data.ndim == 1:
  1070. data = np.expand_dims(data, axis=1)
  1071. self.add_form(data, name)
  1072. return data
  1073. def add_python(self, python_file, sheet_name):
  1074. name = {}
  1075. name.update(globals().copy())
  1076. name.update(locals().copy())
  1077. exec(python_file, name)
  1078. exec("get = Creat()", name)
  1079. if isinstance(name["get"], np.array):
  1080. get = name["get"]
  1081. else:
  1082. try:
  1083. get = np.array(name["get"])
  1084. except BaseException:
  1085. get = np.array([name["get"]])
  1086. self.add_form(get, sheet_name)
  1087. return get
  1088. def get_form(self) -> dict:
  1089. return self.numpy_dict.copy()
  1090. def get_sheet(self, name) -> np.array:
  1091. return self.numpy_dict[name].copy()
  1092. def to_csv(self, save_dir: str, name, sep) -> str:
  1093. get = self.get_sheet(name)
  1094. np.savetxt(save_dir, get, delimiter=sep)
  1095. return save_dir
  1096. def to_html_one(self, name, html_dir=""):
  1097. if html_dir == "":
  1098. html_dir = f"{name}.html"
  1099. get = self.get_sheet(name)
  1100. if get.ndim == 1:
  1101. get = np.expand_dims(get, axis=1)
  1102. get = get.tolist()
  1103. for i in range(len(get)):
  1104. get[i] = [i + 1] + get[i]
  1105. headers = [i for i in range(len(get[0]))]
  1106. table = TableFisrt()
  1107. table.add(headers, get).set_global_opts(
  1108. title_opts=opts.ComponentTitleOpts(
  1109. title=f"表格:{name}", subtitle="CoTan~机器学习:查看数据"
  1110. )
  1111. )
  1112. table.render(html_dir)
  1113. return html_dir

    def to_html(self, name, html_dir="", html_type=0):
        if html_dir == "":
            html_dir = f"{name}.html"
        # move the sheet to be drawn to the front
        sheet_dict = self.get_form()
        del sheet_dict[name]
        sheet_list = [name] + list(sheet_dict.keys())

        class TabBase:
            def __init__(self, q):
                self.tab = q  # a single Tab

            def render(self, render_dir):
                return self.tab.render(render_dir)

        # build the display page
        if html_type == 0:

            class NewTab(TabBase):
                def add(self, table, k, *f):
                    self.tab.add(table, k)

            tab = NewTab(tab_First(page_title="CoTan:查看表格"))  # a single Tab
        elif html_type == 1:

            class NewTab(TabBase):
                def add(self, table, *k):
                    self.tab.add(table)

            tab = NewTab(Page(page_title="CoTan:查看表格", layout=Page.DraggablePageLayout))
        else:

            class NewTab(TabBase):
                def add(self, table, *k):
                    self.tab.add(table)

            tab = NewTab(Page(page_title="CoTan:查看表格", layout=Page.SimplePageLayout))
        # add the contents one sheet at a time
        for name in sheet_list:
            get = self.get_sheet(name)
            if get.ndim == 1:
                get = np.expand_dims(get, axis=1)
            get = get.tolist()
            for i in range(len(get)):
                get[i] = [i + 1] + get[i]
            headers = [i for i in range(len(get[0]))]
            table = TableFisrt()
            table.add(headers, get).set_global_opts(
                title_opts=opts.ComponentTitleOpts(
                    title=f"表格:{name}", subtitle="CoTan~机器学习:查看数据"
                )
            )
            tab.add(table, f"表格:{name}")
        tab.render(html_dir)
        return html_dir

    def merge(self, name, axis=0):  # axis: 0 - hstack, 1 - vstack, 2 - dstack
        sheet_list = []
        for i in name:
            sheet_list.append(self.get_sheet(i))
        get = {0: np.hstack, 1: np.vstack, 2: np.dstack}[axis](sheet_list)
        self.add_form(np.array(get), f"{name[0]}合成")

    def split(self, name, split=2, axis=0):  # axis: 0 - hsplit, 1 - vsplit, 2 - dsplit
        sheet = self.get_sheet(name)
        get = {0: np.hsplit, 1: np.vsplit, 2: np.dsplit}[axis](sheet, split)
        for i in get:
            self.add_form(i, f"{name}分割")

    def two_split(self, name, split, axis):  # binary split (0 - horizontal, 1 - vertical)
        sheet = self.get_sheet(name)
        try:
            split = float(eval(split))
            if split < 1:
                # a fraction selects a proportional row/column index
                split = int(split * (len(sheet) if axis == 1 else len(sheet[0])))
            else:
                raise Exception
        except BaseException:
            split = int(split)
        if axis == 0:
            self.add_form(sheet[:, split:], f"{name}分割")
            self.add_form(sheet[:, :split], f"{name}分割")

    def deep(self, sheet: np.ndarray):
        return sheet.ravel()

    def down_ndim(self, sheet: np.ndarray):  # flatten along rows
        down_list = []
        for i in sheet:
            down_list.append(i.ravel())
        return np.array(down_list)

    def longitudinal_down_ndim(self, sheet: np.ndarray):  # flatten along columns
        down_list = []
        for i in range(len(sheet[0])):
            down_list.append(sheet[:, i].ravel())
        return np.array(down_list).T

    def reval(self, name, axis):  # axis: 0 - rows, 1 - columns (with .T), 2 - deep ravel
        sheet = self.get_sheet(name)
        self.add_form(
            {0: self.down_ndim, 1: self.longitudinal_down_ndim, 2: self.deep}[axis](
                sheet
            ).copy(),
            f"{name}伸展",
        )

    def del_ndim(self, name):  # drop size-1 dimensions
        sheet = self.get_sheet(name)
        self.add_form(np.squeeze(sheet), f"{name}降维")

    def transpose(self, name, func: list):
        sheet = self.get_sheet(name)
        if sheet.ndim <= 2:
            self.add_form(sheet.transpose().copy(), f"{name}.T")
        else:
            self.add_form(np.transpose(sheet, func).copy(), f"{name}.T")

    def reshape(self, name, shape: list):
        sheet = self.get_sheet(name)
        self.add_form(sheet.reshape(shape).copy(), f"{name}.r")

    def fucn_add(self):
        self.func_dict = {
            "abs": lambda x, y: np.abs(x),
            "sqrt": lambda x, y: np.sqrt(x),
            "pow": lambda x, y: x ** y,
            "loge": lambda x, y: np.log(x),
            "log10": lambda x, y: np.log10(x),
            "ceil": lambda x, y: np.ceil(x),
            "floor": lambda x, y: np.floor(x),
            "rint": lambda x, y: np.rint(x),
            "sin": lambda x, y: np.sin(x),
            "cos": lambda x, y: np.cos(x),
            "tan": lambda x, y: np.tan(x),
            "tanh": lambda x, y: np.tanh(x),
            "sinh": lambda x, y: np.sinh(x),
            "cosh": lambda x, y: np.cosh(x),
            "asin": lambda x, y: np.arcsin(x),
            "acos": lambda x, y: np.arccos(x),
            "atan": lambda x, y: np.arctan(x),
            "atanh": lambda x, y: np.arctanh(x),
            "asinh": lambda x, y: np.arcsinh(x),
            "acosh": lambda x, y: np.arccosh(x),
            "add": lambda x, y: x + y,  # matrix or element-wise
            "sub": lambda x, y: x - y,  # matrix or element-wise
            "mul": lambda x, y: np.multiply(x, y),  # element-wise
            "matmul": lambda x, y: np.matmul(x, y),  # matrix product
            "dot": lambda x, y: np.dot(x, y),  # matrix product
            "div": lambda x, y: x / y,
            "div_floor": lambda x, y: np.floor_divide(x, y),
            "power": lambda x, y: np.power(x, y),  # element-wise
        }

    def calculation_matrix(self, data, data_type, func):
        if 1 not in data_type:
            raise Exception
        func_name = func  # keep the name; `func` becomes the callable below
        func = self.func_dict.get(func, lambda x, y: x)
        args_data = []
        for i in range(len(data)):
            if data_type[i] == 0:
                args_data.append(data[i])
            else:
                args_data.append(self.get_sheet(data[i]))
        get = func(*args_data)
        self.add_form(get, f"{func_name}({data[0]},{data[1]})")
        return get
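

# Added usage sketch (not part of the original module): how the Learner sheet
# store and the func_dict-based calculator compose. Sheet names and data are made up.
def _demo_learner():
    learner = Learner()
    learner.add_form(np.array([[1.0, 2.0], [3.0, 4.0]]), "a")  # stored as "a[0]"
    learner.add_form(np.array([[10.0, 20.0], [30.0, 40.0]]), "b")  # stored as "b[1]"
    names = list(learner.get_form().keys())
    # data_type: 1 marks a sheet name to look up, 0 a literal value
    return learner.calculation_matrix(names, [1, 1], "add")  # element-wise sum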

class StudyMachinebase:
    def __init__(self, *args, **kwargs):
        self.model = None
        self.have_fit = False
        self.have_predict = False
        self.x_traindata = None
        self.y_traindata = None
        # test data, specific to supervised learning
        self.x_testdata = None
        self.y_testdata = None
        # these are recorded so the model can be cloned

    def fit_model(self, x_data, y_data, split=0.3, increment=True, **kwargs):
        y_data = y_data.ravel()
        try:
            if self.x_traindata is None or not increment:
                raise Exception
            # np.vstack takes a tuple; the raveled 1-D labels are hstacked
            self.x_traindata = np.vstack((x_data, self.x_traindata))
            self.y_traindata = np.hstack((y_data, self.y_traindata))
        except BaseException:
            self.x_traindata = x_data.copy()
            self.y_traindata = y_data.copy()
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=split
        )
        try:  # incremental training
            if not increment:
                raise Exception
            self.model.partial_fit(x_data, y_data)
        except BaseException:
            self.model.fit(self.x_traindata, self.y_traindata)
        train_score = self.model.score(x_train, y_train)
        test_score = self.model.score(x_test, y_test)
        self.have_fit = True
        return train_score, test_score

    def score(self, x_data, y_data):
        score = self.model.score(x_data, y_data)
        return score

    def class_score(self, save_dir, x_data: np.ndarray, y_really: np.ndarray):
        y_really = y_really.ravel()
        y_predict = self.predict(x_data)[0]
        accuracy = self._accuracy(y_predict, y_really)
        recall, class_list = self._macro(y_predict, y_really)
        precision, class_list = self._macro(y_predict, y_really, 1)
        f1, class_list = self._macro(y_predict, y_really, 2)
        confusion_matrix, class_list = self._confusion_matrix(y_predict, y_really)
        kappa = self._kappa_score(y_predict, y_really)
        tab = Tab()

        def gauge_base(name: str, value: float) -> Gauge:
            c = (
                Gauge()
                .add("", [(name, round(value * 100, 2))], min_=0, max_=100)
                .set_global_opts(title_opts=opts.TitleOpts(title=name))
            )
            return c

        tab.add(gauge_base("准确率", accuracy), "准确率")
        tab.add(gauge_base("kappa", kappa), "kappa")

        def bar_base(name, value) -> Bar:
            c = (
                Bar()
                .add_xaxis(class_list)
                .add_yaxis(name, value, **label_setting)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title=name), **global_setting
                )
            )
            return c

        tab.add(bar_base("精确率", precision.tolist()), "精确率")
        tab.add(bar_base("召回率", recall.tolist()), "召回率")
        tab.add(bar_base("F1", f1.tolist()), "F1")

        def heatmap_base(name, value, max_, min_, show) -> HeatMap:
            c = (
                HeatMap()
                .add_xaxis(class_list)
                .add_yaxis(
                    name,
                    class_list,
                    value,
                    label_opts=opts.LabelOpts(is_show=show, position="inside"),
                )
                .set_global_opts(
                    title_opts=opts.TitleOpts(title=name),
                    **global_setting,
                    visualmap_opts=opts.VisualMapOpts(
                        max_=max_, min_=min_, pos_right="3%"
                    ),
                )
            )
            return c

        value = [
            [class_list[i], class_list[j], float(confusion_matrix[i, j])]
            for i in range(len(class_list))
            for j in range(len(class_list))
        ]
        tab.add(
            heatmap_base(
                "混淆矩阵",
                value,
                float(confusion_matrix.max()),
                float(confusion_matrix.min()),
                len(class_list) < 7,
            ),
            "混淆矩阵",
        )
        des_to_csv(save_dir, "混淆矩阵", confusion_matrix, class_list, class_list)
        des_to_csv(
            save_dir, "评分", [precision, recall, f1], class_list, ["精确率", "召回率", "F1"]
        )
        save = save_dir + r"/分类模型评估.HTML"
        tab.render(save)
        return (save,)

    def _accuracy(self, y_predict, y_really):  # accuracy
        return accuracy_score(y_really, y_predict)

    def _macro(self, y_predict, y_really, func=0):
        # 0 - recall, 1 - precision, 2 - F1
        func_list = [recall_score, precision_score, f1_score]
        class_ = np.unique(y_really).tolist()
        result = func_list[func](y_really, y_predict, labels=class_, average=None)
        return result, class_

    def _confusion_matrix(self, y_predict, y_really):  # confusion matrix
        class_ = np.unique(y_really).tolist()
        return confusion_matrix(y_really, y_predict), class_

    def _kappa_score(self, y_predict, y_really):
        return cohen_kappa_score(y_really, y_predict)

    def regression_score(self, save_dir, x_data: np.ndarray, y_really: np.ndarray):
        y_really = y_really.ravel()
        y_predict = self.predict(x_data)[0]
        tab = Tab()
        mse = self._mse(y_predict, y_really)
        mae = self._mae(y_predict, y_really)
        r2_score = self._r2_score(y_predict, y_really)
        rmse = self._rmse(y_predict, y_really)
        tab.add(
            make_tab(["MSE", "MAE", "RMSE", "r2_Score"], [[mse, mae, rmse, r2_score]]),
            "评估数据",
        )
        save = save_dir + r"/回归模型评估.HTML"
        tab.render(save)
        return (save,)

    def clusters_score(self, save_dir, x_data: np.ndarray, *args):
        y_predict = self.predict(x_data)[0]
        tab = Tab()
        coefficient, coefficient_array = self._coefficient_clustering(x_data, y_predict)

        def gauge_base(name: str, value: float) -> Gauge:
            c = (
                Gauge()
                .add(
                    "",
                    [(name, round(value * 100, 2))],
                    min_=0,
                    max_=10 ** (judging_digits(value * 100)),
                )
                .set_global_opts(title_opts=opts.TitleOpts(title=name))
            )
            return c

        def bar_base(name, value, xaxis) -> Bar:
            c = (
                Bar()
                .add_xaxis(xaxis)
                .add_yaxis(name, value, **label_setting)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title=name), **global_setting
                )
            )
            return c

        tab.add(gauge_base("平均轮廓系数", coefficient), "平均轮廓系数")

        def bar_(coefficient_array, name="数据轮廓系数"):
            xaxis = [f"数据{i}" for i in range(len(coefficient_array))]
            value = coefficient_array.tolist()
            tab.add(bar_base(name, value, xaxis), name)

        n = 20
        if len(coefficient_array) <= n:
            bar_(coefficient_array)
        elif len(coefficient_array) <= n ** 2:
            a = 0
            while a <= len(coefficient_array):
                b = a + n
                if b >= len(coefficient_array):
                    b = len(coefficient_array) + 1
                cofe_array = coefficient_array[a:b]
                bar_(cofe_array, f"{a}-{b}数据轮廓系数")
                a += n
        else:
            split = np.hsplit(coefficient_array, n)
            step = 100 // n  # each of the n chunks covers 100/n percent of the data
            a = 0
            for cofe_array in split:
                bar_(cofe_array, f"{a}%-{a + step}%数据轮廓系数")
                a += step
        save = save_dir + r"/聚类模型评估.HTML"
        tab.render(save)
        return (save,)

    def _mse(self, y_predict, y_really):  # mean squared error
        return mean_squared_error(y_really, y_predict)

    def _mae(self, y_predict, y_really):  # median absolute error
        return median_absolute_error(y_really, y_predict)

    def _r2_score(self, y_predict, y_really):  # R² (coefficient of determination)
        return r2_score(y_really, y_predict)

    def _rmse(self, y_predict, y_really):  # root mean squared error
        return self._mse(y_predict, y_really) ** 0.5

    def _coefficient_clustering(self, x_data, y_predict):
        means_score = silhouette_score(x_data, y_predict)
        outline_score = silhouette_samples(x_data, y_predict)
        return means_score, outline_score

    def predict(self, x_data, *args, **kwargs):
        self.x_testdata = x_data.copy()
        y_predict = self.model.predict(x_data)
        self.y_testdata = y_predict.copy()
        self.have_predict = True
        return y_predict, "预测"

    def data_visualization(self, save_dir, *args, **kwargs):
        return (save_dir,)
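

# Added example (illustrative): the per-class metrics that class_score aggregates,
# computed directly with the sklearn functions imported above (average=None
# returns one score per label, exactly what _macro feeds into the bar charts).
def _demo_macro_metrics():
    y_true = np.array([0, 0, 1, 1, 2])
    y_pred = np.array([0, 1, 1, 1, 2])
    labels = np.unique(y_true).tolist()
    return (
        recall_score(y_true, y_pred, labels=labels, average=None),
        precision_score(y_true, y_pred, labels=labels, average=None),
        f1_score(y_true, y_pred, labels=labels, average=None),
    )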

class PrepBase(StudyMachinebase):  # does not allow a second round of training
    def __init__(self, *args, **kwargs):
        super(PrepBase, self).__init__(*args, **kwargs)
        self.model = None

    def fit_model(self, x_data, y_data, increment=True, *args, **kwargs):
        if not self.have_predict:  # no second round of training
            y_data = y_data.ravel()
            try:
                if self.x_traindata is None or not increment:
                    raise Exception
                self.x_traindata = np.vstack((x_data, self.x_traindata))
                self.y_traindata = np.hstack((y_data, self.y_traindata))  # 1-D labels
            except BaseException:
                self.x_traindata = x_data.copy()
                self.y_traindata = y_data.copy()
            try:  # incremental training
                if not increment:
                    raise Exception
                self.model.partial_fit(x_data, y_data)
            except BaseException:
                self.model.fit(self.x_traindata, self.y_traindata)
            self.have_fit = True
        return "None", "None"

    def predict(self, x_data, *args, **kwargs):
        self.x_testdata = x_data.copy()
        x_predict = self.model.transform(x_data)
        self.y_testdata = x_predict.copy()
        self.have_predict = True
        return x_predict, "特征工程"

    def score(self, x_data, y_data):
        return "None"  # no score available

class Unsupervised(PrepBase):  # unsupervised; no second round of training
    def fit_model(self, x_data, increment=True, *args, **kwargs):
        if not self.have_predict:  # no second round of training
            self.y_traindata = None
            try:
                if self.x_traindata is None or not increment:
                    raise Exception
                self.x_traindata = np.vstack((x_data, self.x_traindata))
            except BaseException:
                self.x_traindata = x_data.copy()
            try:  # incremental training
                if not increment:
                    raise Exception
                self.model.partial_fit(x_data)
            except BaseException:
                self.model.fit(self.x_traindata, self.y_traindata)
            self.have_fit = True
        return "None", "None"

class UnsupervisedModel(PrepBase):  # unsupervised
    def fit_model(self, x_data, increment=True, *args, **kwargs):
        self.y_traindata = None
        try:
            if self.x_traindata is None or not increment:
                raise Exception
            self.x_traindata = np.vstack((x_data, self.x_traindata))
        except BaseException:
            self.x_traindata = x_data.copy()
        try:  # incremental training
            if not increment:
                raise Exception
            self.model.partial_fit(x_data)
        except BaseException:
            self.model.fit(self.x_traindata, self.y_traindata)
        self.have_fit = True
        return "None", "None"

class ToPyebase(StudyMachinebase):
    def __init__(self, model, *args, **kwargs):
        super(ToPyebase, self).__init__(*args, **kwargs)
        self.model = None
        # recorded so the model can be cloned
        self.k = {}
        self.model_Name = model

    def fit_model(self, x_data, y_data, *args, **kwargs):
        self.x_traindata = x_data.copy()
        self.y_traindata = y_data.ravel().copy()
        self.have_fit = True
        return "None", "None"

    def predict(self, x_data, *args, **kwargs):
        self.have_predict = True
        return np.array([]), "请使用训练"

    def score(self, x_data, y_data):
        return "None"  # no score available

def num_str(num, accuracy):
    num = str(round(float(num), accuracy))
    if len(num.replace(".", "")) == accuracy:
        return num
    n = num.split(".")
    if len(n) == 1:  # no decimal part (split never returns an empty list)
        return num + "." + "0" * (accuracy - len(num))
    else:
        return num + "0" * (accuracy - len(num) + 1)  # len(num) counts the decimal point
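

# Added sanity checks (illustrative): num_str pads with zeros so every entry in
# the cumulative-statistics tables below has `accuracy` digits.
def _demo_num_str():
    assert num_str(2.5, 4) == "2.500"  # padded to four digits
    assert num_str(0.1, 3) == "0.10"
    assert num_str(7, 3) == "7.00"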

def des_to_csv(save_dir, name, data, columns=None, row=None):
    save_dir = save_dir + "/" + name + ".csv"
    DataFrame(data, columns=columns, index=row).to_csv(
        save_dir,
        header=False if columns is None else True,
        index=False if row is None else True,
    )
    return data

class DataAnalysis(ToPyebase):  # data analysis
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        data = self.x_traindata

        def cumulative_calculation(tab_data, func, name, render_tab):
            sum_list = []
            for i in range(len(tab_data)):  # iterate row by row
                sum_list.append([])
                for a in range(len(tab_data[i])):
                    s = num_str(func(tab_data[: i + 1, a]), 8)
                    sum_list[-1].append(s)
            des_to_csv(save_dir, f"{name}", sum_list)
            render_tab.add(
                make_tab([f"[{i}]" for i in range(len(sum_list[0]))], sum_list),
                f"{name}",
            )

        def geometric_mean(x):
            return np.power(np.prod(x), 1 / len(x))  # geometric mean

        def square_mean(x):
            return np.sqrt(np.sum(np.power(x, 2)) / len(x))  # quadratic (RMS) mean

        def harmonic_mean(x):
            return len(x) / np.sum(np.power(x, -1))  # harmonic mean

        cumulative_calculation(data, np.sum, "累计求和", tab)
        cumulative_calculation(data, np.var, "累计方差", tab)
        cumulative_calculation(data, np.std, "累计标准差", tab)
        cumulative_calculation(data, np.mean, "累计算术平均值", tab)
        cumulative_calculation(data, geometric_mean, "累计几何平均值", tab)
        cumulative_calculation(data, square_mean, "累计平方平均值", tab)
        cumulative_calculation(data, harmonic_mean, "累计调和平均值", tab)
        cumulative_calculation(data, np.median, "累计中位数", tab)
        cumulative_calculation(data, np.max, "累计最大值", tab)
        cumulative_calculation(data, np.min, "累计最小值", tab)
        save = save_dir + r"/数据分析.HTML"
        tab.render(save)  # render the HTML
        return (save,)
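

# Added example (illustrative): the three means defined inside
# DataAnalysis.data_visualization, checked on a small vector.
def _demo_means():
    x = np.array([1.0, 2.0, 4.0])
    geometric = np.power(np.prod(x), 1 / len(x))  # (1*2*4)**(1/3) == 2.0
    square = np.sqrt(np.sum(np.power(x, 2)) / len(x))  # RMS, about 2.65
    harmonic = len(x) / np.sum(np.power(x, -1.0))  # 3 / 1.75, about 1.71
    return geometric, square, harmonic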

class Corr(ToPyebase):  # correlation and covariance
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        data = DataFrame(self.x_traindata)
        corr = data.corr().to_numpy()  # correlation
        cov = data.cov().to_numpy()  # covariance

        def heat_map(data, name: str, max_, min_):
            x = [f"特征[{i}]" for i in range(len(data))]
            y = [f"特征[{i}]" for i in range(len(data[0]))]
            value = [
                (f"特征[{i}]", f"特征[{j}]", float(data[i][j]))
                for i in range(len(data))
                for j in range(len(data[i]))
            ]
            c = (
                HeatMap()
                .add_xaxis(x)
                # hide the labels if there are too many features
                .add_yaxis(
                    "数据",
                    y,
                    value,
                    label_opts=opts.LabelOpts(
                        is_show=True if len(x) <= 10 else False, position="inside"
                    ),
                )
                .set_global_opts(
                    title_opts=opts.TitleOpts(title="矩阵热力图"),
                    **global_not_legend,
                    yaxis_opts=opts.AxisOpts(
                        is_scale=True, type_="category"
                    ),  # 'category'
                    xaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                    visualmap_opts=opts.VisualMapOpts(
                        is_show=True, max_=max_, min_=min_, pos_right="3%"
                    ),
                )  # display
            )
            tab.add(c, name)

        heat_map(corr, "相关性热力图", 1, -1)
        heat_map(cov, "协方差热力图", float(cov.max()), float(cov.min()))
        des_to_csv(save_dir, "相关性矩阵", corr)
        des_to_csv(save_dir, "协方差矩阵", cov)
        save = save_dir + r"/数据相关性.HTML"
        tab.render(save)  # render the HTML
        return (save,)
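

# Added example (illustrative): the same correlation/covariance matrices that
# Corr renders as heat maps, computed directly with pandas on toy data.
def _demo_corr_cov():
    data = DataFrame(np.array([[1.0, 2.0], [2.0, 4.1], [3.0, 5.9]]))
    return data.corr().to_numpy(), data.cov().to_numpy()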

class ViewData(ToPyebase):  # expose a trained learner's data for viewing
    def __init__(
        self, args_use, learner, *args, **kwargs
    ):  # learner is the wrapped model whose data is shown
        super(ViewData, self).__init__(args_use, learner, *args, **kwargs)
        self.model = learner.Model
        self.Select_Model = None
        self.have_fit = learner.have_Fit
        self.model_Name = "Select_Model"
        self.learner = learner
        self.learner_name = learner.Model_Name

    def fit_model(self, *args, **kwargs):
        self.have_fit = True
        return "None", "None"

    def predict(self, x_data, add_func=None, *args, **kwargs):
        x_traindata = self.learner.x_traindata
        y_traindata = self.learner.y_traindata
        x_name = self.learner_name
        if x_traindata is not None:
            add_func(x_traindata, f"{x_name}:x训练数据")
        try:
            x_testdata = self.x_testdata
            if x_testdata is not None:
                add_func(x_testdata, f"{x_name}:x测试数据")
        except BaseException:
            pass
        try:
            y_testdata = self.y_testdata.copy()
            if y_testdata is not None:
                add_func(y_testdata, f"{x_name}:y测试数据")
        except BaseException:
            pass
        self.have_fit = True
        if y_traindata is None:
            return np.array([]), "y训练数据"
        return y_traindata, "y训练数据"

    def data_visualization(self, save_dir, *args, **kwargs):
        return (save_dir,)

class MatrixScatter(ToPyebase):  # matrix scatter plot
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        data = self.x_traindata
        if data.ndim <= 2:  # one- or two-dimensional
            c = (
                Scatter()
                # 1-D data has no second axis; guard the shape lookup
                .add_xaxis([f"{i}" for i in range(data.shape[1] if data.ndim == 2 else 1)])
                .set_global_opts(
                    title_opts=opts.TitleOpts(title="矩阵散点图"), **global_not_legend
                )
            )
            if data.ndim == 2:
                for num in range(len(data)):
                    i = data[num]
                    c.add_yaxis(f"{num}", [[f"{num}", x] for x in i], color="#FFFFFF")
            else:
                c.add_yaxis("0", [[0, x] for x in data], color="#FFFFFF")
            c.set_series_opts(
                label_opts=opts.LabelOpts(
                    is_show=True,
                    color="#000000",
                    position="inside",
                    formatter=JsCode("function(params){return params.data[2];}"),
                )
            )
        elif data.ndim == 3:
            c = Scatter3D().set_global_opts(
                title_opts=opts.TitleOpts(title="矩阵散点图"), **global_not_legend
            )
            for num in range(len(data)):
                i = data[num]
                for s_num in range(len(i)):
                    s = i[s_num]
                    y_data = [[num, s_num, x, float(s[x])] for x in range(len(s))]
                    c.add(
                        f"{num}", y_data, zaxis3d_opts=opts.Axis3DOpts(type_="category")
                    )
            c.set_series_opts(
                label_opts=opts.LabelOpts(
                    is_show=True,
                    color="#000000",
                    position="inside",
                    formatter=JsCode("function(params){return params.data[3];}"),
                )
            )
        else:
            c = Scatter()
        tab.add(c, "矩阵散点图")
        save = save_dir + r"/矩阵散点图.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class ClusterTree(ToPyebase):  # cluster dendrogram
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        x_data = self.x_traindata
        linkage_array = ward(x_data)  # self.y_traindata holds the cluster labels
        dendrogram(linkage_array)
        plt.savefig(save_dir + r"/Cluster_graph.png")
        image = Image()
        image.add(src=save_dir + r"/Cluster_graph.png").set_global_opts(
            title_opts=opts.ComponentTitleOpts(title="聚类树状图")
        )
        tab.add(image, "聚类树状图")
        save = save_dir + r"/聚类树状图.HTML"
        tab.render(save)  # render the HTML
        return (save,)
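

# Added example (illustrative): the ward + dendrogram pipeline that ClusterTree
# wraps, run on random data; writes the same kind of PNG the class embeds.
def _demo_cluster_tree(save_dir="."):
    x = np.random.rand(12, 3)  # 12 samples, 3 features
    dendrogram(ward(x))  # ward() returns the linkage matrix
    plt.savefig(save_dir + r"/Cluster_graph_demo.png")
    plt.close()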

class ClassBar(ToPyebase):  # per-class bar chart
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        x_data = self.x_traindata.transpose()
        y_data = self.y_traindata
        class_ = np.unique(y_data).tolist()  # class labels
        class_list = []
        for n_class in class_:  # build class_list (one boolean mask per class)
            class_list.append(y_data == n_class)
        for num_i in range(len(x_data)):  # iterate over every feature
            i = x_data[num_i]
            i_con = is_continuous(i)
            if i_con and len(i) >= 11:
                # plot data: first level is a class (legend), second level the buckets
                c_list = [[0] * 10 for _ in class_list]
                start = i.min()
                end = i.max()
                n = (end - start) / 10  # ten bars
                x_axis = []  # x axis
                iter_num = 0  # current bucket
                while iter_num <= 9:  # split the feature into ten buckets
                    # label the bucket on the x axis
                    x_axis.append(
                        f"({iter_num})[{round(start, 2)}-{round((start + n) if (start + n) <= end or not iter_num == 9 else end, 2)}]"
                    )
                    try:
                        if iter_num == 9:
                            raise Exception  # the last bucket takes whatever is left
                        s = (start <= i) & (i < start + n)  # boolean index for the bucket
                    except BaseException:  # start + n may overshoot end
                        s = (start <= i) & (i <= end)  # boolean index
                    # n_data = i[s]  # the feature values in this bucket
                    for num in range(len(class_list)):  # iterate per class
                        # now_class is the boolean mask y_data == n_class, used for slicing
                        now_class: list = class_list[num]
                        # slice it to the same positions as this bucket
                        bool_class = now_class[s].ravel()
                        # count matches; c_list = [[class1 counts], [class2 counts], ...]
                        c_list[num][iter_num] = int(np.sum(bool_class))
                    iter_num += 1
                    start += n
            else:
                iter_np = np.unique(i)
                # plot data: first level is a class (legend), second level each value
                c_list = [[0] * len(iter_np) for _ in class_list]
                x_axis = []  # x-axis labels
                for i_num in range(len(iter_np)):  # iterate over distinct values
                    i_data = iter_np[i_num]
                    # n_data = i[i == i_data]  # the matching feature values
                    x_axis.append(f"[{i_data}]")
                    for num in range(len(class_list)):  # iterate per class
                        now_class = class_list[num]  # boolean mask for this class
                        # slice it to the same positions as this value
                        bool_class = now_class[i == i_data]
                        # count matches; c_list = [[class1 counts], [class2 counts], ...]
                        c_list[num][i_num] = int(np.sum(bool_class))
            c = (
                Bar()
                .add_xaxis(x_axis)
                .set_global_opts(
                    title_opts=opts.TitleOpts(title="类型-特征统计柱状图"),
                    **global_setting,
                    xaxis_opts=opts.AxisOpts(type_="category"),
                    yaxis_opts=opts.AxisOpts(type_="value"),
                )
            )
            y_axis = []
            for i in range(len(c_list)):
                y_axis.append(f"{class_[i]}")
                c.add_yaxis(f"{class_[i]}", c_list[i], **label_setting)
            des_to_csv(save_dir, f"类型-[{num_i}]特征统计柱状图", c_list, x_axis, y_axis)
            tab.add(c, f"类型-[{num_i}]特征统计柱状图")
        # unfinished
        save = save_dir + r"/特征统计.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class NumpyHeatMap(ToPyebase):  # heat map of a NumPy matrix
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        data = self.x_traindata
        x = [f"横[{i}]" for i in range(len(data))]
        y = [f"纵[{i}]" for i in range(len(data[0]))]
        value = [
            (f"横[{i}]", f"纵[{j}]", float(data[i][j]))
            for i in range(len(data))
            for j in range(len(data[i]))
        ]
        c = (
            HeatMap()
            .add_xaxis(x)
            .add_yaxis("数据", y, value, **label_setting)  # the first entry of each value triple is x
            .set_global_opts(
                title_opts=opts.TitleOpts(title="矩阵热力图"),
                **global_not_legend,
                yaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),  # 'category'
                xaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
                visualmap_opts=opts.VisualMapOpts(
                    is_show=True,
                    max_=float(data.max()),
                    min_=float(data.min()),
                    pos_right="3%",
                ),
            )  # display
        )
        tab.add(c, "矩阵热力图")
        tab.add(make_tab(x, data.transpose().tolist()), "矩阵热力图:表格")
        save = save_dir + r"/矩阵热力图.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class PredictiveHeatmapBase(ToPyebase):  # predictive heat map
    def __init__(
        self, args_use, learner, *args, **kwargs
    ):  # learner is the wrapped model whose boundaries are drawn
        super(PredictiveHeatmapBase, self).__init__(args_use, learner, *args, **kwargs)
        self.model = learner.Model
        self.select_model = None
        self.have_fit = learner.have_Fit
        self.model_Name = "Select_Model"
        self.learner = learner
        self.x_traindata = learner.x_traindata.copy()
        self.y_traindata = learner.y_traindata.copy()
        self.means = []

    def fit_model(self, x_data, *args, **kwargs):
        try:
            self.means = x_data.ravel()
        except BaseException:
            pass
        self.have_fit = True
        return "None", "None"

    def data_visualization(
        self,
        save_dir,
        decision_boundary_func=None,
        prediction_boundary_func=None,
        *args,
        **kwargs,
    ):
        tab = Tab()
        y = self.y_traindata
        x_data = self.x_traindata
        try:  # falls back to regression when the model has no classes
            class_ = self.model.classes_.tolist()
            class_heard = [f"类别[{i}]" for i in range(len(class_))]
            # fetch the data
            get, x_means, x_range, data_type = training_visualization(x_data, class_, y)
            # the stored means may be used; nan means "skip this feature"
            for i in range(min([len(x_means), len(self.means)])):
                try:
                    g = self.means[i]
                    if np.isnan(g):  # `g == np.nan` is always False
                        raise Exception
                    x_means[i] = g
                except BaseException:
                    pass
            get = decision_boundary_func(
                x_range, x_means, self.learner.predict, class_, data_type
            )
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            heard = class_heard + [f"普适预测第{i}特征" for i in range(len(x_means))]
            data = class_ + [f"{i}" for i in x_means]
            c = Table().add(headers=heard, rows=[data])
            tab.add(c, "数据表")
        except BaseException:
            get, x_means, x_range, data_type = regress_visualization(x_data, y)
            get = prediction_boundary_func(
                x_range, x_means, self.learner.predict, data_type
            )
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            heard = [f"普适预测第{i}特征" for i in range(len(x_means))]
            data = [f"{i}" for i in x_means]
            c = Table().add(headers=heard, rows=[data])
            tab.add(c, "数据表")
        save = save_dir + r"/预测热力图.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class PredictiveHeatmap(PredictiveHeatmapBase):  # predictive heat map
    def data_visualization(self, save_dir, *args, **kwargs):
        return super().data_visualization(
            save_dir, decision_boundary, prediction_boundary
        )


class PredictiveHeatmapMore(PredictiveHeatmapBase):  # predictive heat map (multi-feature)
    def data_visualization(self, save_dir, *args, **kwargs):
        return super().data_visualization(
            save_dir, decision_boundary_more, prediction_boundary_more
        )

class NearFeatureScatterClassMore(ToPyebase):
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        x_data = self.x_traindata
        y = self.y_traindata
        class_ = np.unique(y).ravel().tolist()
        class_heard = [f"簇[{i}]" for i in range(len(class_))]
        get, x_means, x_range, data_type = training_visualization_more_no_center(
            x_data, class_, y
        )
        for i in range(len(get)):
            tab.add(get[i], f"{i}训练数据散点图")
        heard = class_heard + [f"普适预测第{i}特征" for i in range(len(x_means))]
        data = class_ + [f"{i}" for i in x_means]
        c = Table().add(headers=heard, rows=[data])
        tab.add(c, "数据表")
        save = save_dir + r"/数据特征散点图(分类).HTML"
        tab.render(save)  # render the HTML
        return (save,)

class NearFeatureScatterMore(ToPyebase):
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        x_data = self.x_traindata
        x_means = quick_stats(x_data).get()[0]
        get_y = feature_visualization(x_data, "数据散点图")  # feature-pair plots
        for i in range(len(get_y)):
            tab.add(get_y[i], f"[{i}]数据x-x散点图")
        heard = [f"普适预测第{i}特征" for i in range(len(x_means))]
        data = [f"{i}" for i in x_means]
        c = Table().add(headers=heard, rows=[data])
        tab.add(c, "数据表")
        save = save_dir + r"/数据特征散点图.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class NearFeatureScatterClass(ToPyebase):  # neighbouring-feature scatter plots: classification data
    def data_visualization(self, save_dir, *args, **kwargs):
        # fetch the data
        class_ = np.unique(self.y_traindata).ravel().tolist()
        class_heard = [f"类别[{i}]" for i in range(len(class_))]
        tab = Tab()
        y = self.y_traindata
        x_data = self.x_traindata
        get, x_means, x_range, data_type = training_visualization(x_data, class_, y)
        for i in range(len(get)):
            tab.add(get[i], f"{i}临近特征散点图")
        heard = class_heard + [f"普适预测第{i}特征" for i in range(len(x_means))]
        data = class_ + [f"{i}" for i in x_means]
        c = Table().add(headers=heard, rows=[data])
        tab.add(c, "数据表")
        save = save_dir + r"/临近数据特征散点图(分类).HTML"
        tab.render(save)  # render the HTML
        return (save,)

class NearFeatureScatter(ToPyebase):  # neighbouring-feature scatter plots: continuous data
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        x_data = self.x_traindata.transpose()
        get, x_means, x_range, data_type = training_visualization_no_class(x_data)
        for i in range(len(get)):
            tab.add(get[i], f"{i}临近特征散点图")
        columns = [f"普适预测第{i}特征" for i in range(len(x_means))]
        data = [f"{i}" for i in x_means]
        tab.add(make_tab(columns, [data]), "数据表")
        save = save_dir + r"/临近数据特征散点图.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class FeatureScatterYX(ToPyebase):  # y-x plots
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        x_data = self.x_traindata
        y = self.y_traindata
        get, x_means, x_range, data_type = regress_visualization(x_data, y)
        for i in range(len(get)):
            tab.add(get[i], f"{i}特征x-y散点图")
        columns = [f"普适预测第{i}特征" for i in range(len(x_means))]
        data = [f"{i}" for i in x_means]
        tab.add(make_tab(columns, [data]), "数据表")
        save = save_dir + r"/特征y-x图像.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class LineModel(StudyMachinebase):
    def __init__(
        self, args_use, model, *args, **kwargs
    ):  # model is the selected model type; alpha is the regularization strength
        super(LineModel, self).__init__(*args, **kwargs)
        model_class = {"Line": LinearRegression, "Ridge": Ridge, "Lasso": Lasso}[model]
        if model == "Line":  # compare the name string, not the looked-up class
            self.model = model_class()
            self.k = {}
        else:
            self.model = model_class(alpha=args_use["alpha"], max_iter=args_use["max_iter"])
            self.k = {"alpha": args_use["alpha"], "max_iter": args_use["max_iter"]}
        # recorded so the model can be cloned
        self.Alpha = args_use["alpha"]
        self.max_iter = args_use["max_iter"]
        self.model_Name = model  # keep the name string, not the class

    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        x_data = self.x_traindata
        y = self.y_traindata
        w_list = self.model.coef_.tolist()
        w_heard = [f"系数w[{i}]" for i in range(len(w_list))]
        b = self.model.intercept_.tolist()
        get, x_means, x_range, data_type = regress_visualization(x_data, y)
        get_line = regress_w(x_data, w_list, b, x_means.copy())
        for i in range(len(get)):
            tab.add(get[i].overlap(get_line[i]), f"{i}预测类型图")
        get = prediction_boundary(x_range, x_means, self.predict, data_type)
        for i in range(len(get)):
            tab.add(get[i], f"{i}预测热力图")
        tab.add(coefficient_scatter_plot(w_heard, w_list), "系数w散点图")
        tab.add(coefficient_bar_plot(w_heard, self.model.coef_), "系数柱状图")
        columns = [f"普适预测第{i}特征" for i in range(len(x_means))] + w_heard + ["截距b"]
        data = [f"{i}" for i in x_means] + w_list + [b]
        if self.model_Name != "Line":
            columns += ["阿尔法", "最大迭代次数"]
            data += [self.model.alpha, self.model.max_iter]
        tab.add(make_tab(columns, [data]), "数据表")
        des_to_csv(
            save_dir,
            "系数表",
            [w_list] + [b],
            [f"系数W[{i}]" for i in range(len(w_list))] + ["截距"],
        )
        des_to_csv(
            save_dir,
            "预测表",
            [[f"{i}" for i in x_means]],
            [f"普适预测第{i}特征" for i in range(len(x_means))],
        )
        save = save_dir + r"/线性回归模型.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class LogisticregressionModel(StudyMachinebase):
    def __init__(
        self, args_use, model, *args, **kwargs
    ):  # model is the selected model type; C controls the regularization
        super(LogisticregressionModel, self).__init__(*args, **kwargs)
        self.model = LogisticRegression(C=args_use["C"], max_iter=args_use["max_iter"])
        # recorded so the model can be cloned
        self.C = args_use["C"]
        self.max_iter = args_use["max_iter"]
        self.k = {"C": args_use["C"], "max_iter": args_use["max_iter"]}
        self.model_Name = model

    def data_visualization(self, save_dir="render.html", *args, **kwargs):
        # fetch the data
        w_array = self.model.coef_
        w_list = w_array.tolist()  # as a table
        b = self.model.intercept_
        c = self.model.C
        max_iter = self.model.max_iter
        class_ = self.model.classes_.tolist()
        class_heard = [f"类别[{i}]" for i in range(len(class_))]
        tab = Tab()
        y = self.y_traindata
        x_data = self.x_traindata
        get, x_means, x_range, data_type = training_visualization(x_data, class_, y)
        get_line = training_w(x_data, class_, y, w_list, b, x_means.copy())
        for i in range(len(get)):
            tab.add(get[i].overlap(get_line[i]), f"{i}决策边界散点图")
        for i in range(len(w_list)):
            w = w_list[i]
            w_heard = [f"系数w[{i},{j}]" for j in range(len(w))]
            tab.add(coefficient_scatter_plot(w_heard, w), f"系数w[{i}]散点图")
            tab.add(coefficient_bar_plot(w_heard, w_array[i]), f"系数w[{i}]柱状图")
        columns = class_heard + [f"截距{i}" for i in range(len(b))] + ["C", "最大迭代数"]
        data = class_ + b.tolist() + [c, max_iter]
        c = Table().add(headers=columns, rows=[data])
        tab.add(c, "数据表")
        c = Table().add(
            headers=[f"系数W[{i}]" for i in range(len(w_list[0]))], rows=w_list
        )
        tab.add(c, "系数数据表")
        c = Table().add(
            headers=[f"普适预测第{i}特征" for i in range(len(x_means))],
            rows=[[f"{i}" for i in x_means]],
        )
        tab.add(c, "普适预测数据表")
        des_to_csv(
            save_dir, "系数表", w_list, [f"系数W[{i}]" for i in range(len(w_list[0]))]
        )
        des_to_csv(save_dir, "截距表", [b], [f"截距{i}" for i in range(len(b))])
        des_to_csv(
            save_dir,
            "预测表",
            [[f"{i}" for i in x_means]],
            [f"普适预测第{i}特征" for i in range(len(x_means))],
        )
        save = save_dir + r"/逻辑回归.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class CategoricalData:  # data-statistics helper
    def __init__(self):
        self.x_means = []
        self.x_range = []
        self.data_type = []

    def __call__(self, x1, *args, **kwargs):
        get = self.is_continuous(x1)
        return get

    def is_continuous(self, x1: np.array):
        try:
            x1_con = is_continuous(x1)
            if x1_con:
                self.x_means.append(np.mean(x1))
                self.add_range(x1)
            else:
                raise Exception
            return x1_con
        except BaseException:  # find the most frequent element
            new = np.unique(x1)  # drop duplicates
            count_list = []
            for i in new:
                count_list.append(np.sum(x1 == i))
            index = count_list.index(max(count_list))  # index of the maximum count
            self.x_means.append(new[index])  # the mode (x1[index] indexed the wrong array)
            self.add_range(x1, False)
            return False

    def add_range(self, x1: np.array, range_=True):
        try:
            if not range_:
                raise Exception
            min_ = int(x1.min()) - 1
            max_ = int(x1.max()) + 1
            # no need to copy the list
            self.x_range.append([min_, max_])
            self.data_type.append(1)
        except BaseException:
            self.x_range.append(list(set(x1.tolist())))  # drop duplicates
            self.data_type.append(2)

    def get(self):
        return self.x_means, self.x_range, self.data_type
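

# Added usage sketch (illustrative): CategoricalData accumulates a mean/mode,
# a range/value-set and a type flag per column; which branch a column takes
# depends on this module's is_continuous heuristic.
def _demo_categorical_data():
    helper = CategoricalData()
    helper(np.array([0.5, 1.5, 2.5, 3.5]))  # numeric column
    helper(np.array(["a", "b", "a", "a"]))  # discrete column -> mode "a"
    return helper.get()  # (x_means, x_range, data_type)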

class KnnModel(StudyMachinebase):
    def __init__(
        self, args_use, model, *args, **kwargs
    ):  # model is the selected model type
        super(KnnModel, self).__init__(*args, **kwargs)
        model_class = {"Knn_class": KNeighborsClassifier, "Knn": KNeighborsRegressor}[model]
        self.model = model_class(p=args_use["p"], n_neighbors=args_use["n_neighbors"])
        # recorded so the model can be cloned
        self.n_neighbors = args_use["n_neighbors"]
        self.p = args_use["p"]
        self.k = {"n_neighbors": args_use["n_neighbors"], "p": args_use["p"]}
        self.model_Name = model  # keep the name string, not the class

    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        y = self.y_traindata
        x_data = self.x_traindata
        y_test = self.y_testdata
        x_test = self.x_testdata
        if self.model_Name == "Knn_class":
            class_ = self.model.classes_.tolist()
            class_heard = [f"类别[{i}]" for i in range(len(class_))]
            get, x_means, x_range, data_type = training_visualization(x_data, class_, y)
            for i in range(len(get)):
                tab.add(get[i], f"{i}训练数据散点图")
            if y_test is not None:
                get = training_visualization(x_test, class_, y_test)[0]
                for i in range(len(get)):
                    tab.add(get[i], f"{i}测试数据散点图")
            get = decision_boundary(x_range, x_means, self.predict, class_, data_type)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            heard = class_heard + [f"普适预测第{i}特征" for i in range(len(x_means))]
            data = class_ + [f"{i}" for i in x_means]
            c = Table().add(headers=heard, rows=[data])
            tab.add(c, "数据表")
        else:
            get, x_means, x_range, data_type = regress_visualization(x_data, y)
            for i in range(len(get)):
                tab.add(get[i], f"{i}训练数据散点图")
            get = regress_visualization(x_test, y_test)[0]
            for i in range(len(get)):
                tab.add(get[i], f"{i}测试数据类型图")
            get = prediction_boundary(x_range, x_means, self.predict, data_type)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            heard = [f"普适预测第{i}特征" for i in range(len(x_means))]
            data = [f"{i}" for i in x_means]
            c = Table().add(headers=heard, rows=[data])
            tab.add(c, "数据表")
        des_to_csv(
            save_dir,
            "预测表",
            [[f"{i}" for i in x_means]],
            [f"普适预测第{i}特征" for i in range(len(x_means))],
        )
        save = save_dir + r"/K.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class TreeModel(StudyMachinebase):
    def __init__(
        self, args_use, model, *args, **kwargs
    ):  # model is the selected model type
        super(TreeModel, self).__init__(*args, **kwargs)
        model_class = {"Tree_class": DecisionTreeClassifier, "Tree": DecisionTreeRegressor}[
            model
        ]
        self.model = model_class(
            criterion=args_use["criterion"],
            splitter=args_use["splitter"],
            max_features=args_use["max_features"],
            max_depth=args_use["max_depth"],
            min_samples_split=args_use["min_samples_split"],
        )
        # recorded so the model can be cloned
        self.criterion = args_use["criterion"]
        self.splitter = args_use["splitter"]
        self.max_features = args_use["max_features"]
        self.max_depth = args_use["max_depth"]
        self.min_samples_split = args_use["min_samples_split"]
        self.k = {
            "criterion": args_use["criterion"],
            "splitter": args_use["splitter"],
            "max_features": args_use["max_features"],
            "max_depth": args_use["max_depth"],
            "min_samples_split": args_use["min_samples_split"],
        }
        self.model_Name = model  # keep the name string, not the class

    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        importance = self.model.feature_importances_.tolist()
        with open(save_dir + r"\Tree_Gra.dot", "w") as f:
            export_graphviz(self.model, out_file=f)
        make_bar("特征重要性", importance, tab)
        des_to_csv(
            save_dir,
            "特征重要性",
            [importance],
            [f"[{i}]特征" for i in range(len(importance))],
        )
        tab.add(see_tree(save_dir + r"\Tree_Gra.dot"), "决策树可视化")
        y = self.y_traindata
        x_data = self.x_traindata
        y_test = self.y_testdata
        x_test = self.x_testdata
        if self.model_Name == "Tree_class":
            class_ = self.model.classes_.tolist()
            class_heard = [f"类别[{i}]" for i in range(len(class_))]
            get, x_means, x_range, data_type = training_visualization(x_data, class_, y)
            for i in range(len(get)):
                tab.add(get[i], f"{i}训练数据散点图")
            get = training_visualization(x_test, class_, y_test)[0]
            for i in range(len(get)):
                tab.add(get[i], f"{i}测试数据散点图")
            get = decision_boundary(x_range, x_means, self.predict, class_, data_type)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            tab.add(
                make_tab(
                    class_heard
                    + [f"普适预测第{i}特征" for i in range(len(x_means))]
                    + [f"特征{i}重要性" for i in range(len(importance))],
                    [class_ + [f"{i}" for i in x_means] + importance],
                ),
                "数据表",
            )
        else:
            get, x_means, x_range, data_type = regress_visualization(x_data, y)
            for i in range(len(get)):
                tab.add(get[i], f"{i}训练数据散点图")
            get = regress_visualization(x_test, y_test)[0]
            for i in range(len(get)):
                tab.add(get[i], f"{i}测试数据类型图")
            get = prediction_boundary(x_range, x_means, self.predict, data_type)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            tab.add(
                make_tab(
                    [f"普适预测第{i}特征" for i in range(len(x_means))]
                    + [f"特征{i}重要性" for i in range(len(importance))],
                    [[f"{i}" for i in x_means] + importance],
                ),
                "数据表",
            )
        des_to_csv(
            save_dir,
            "预测表",
            [[f"{i}" for i in x_means]],
            [f"普适预测第{i}特征" for i in range(len(x_means))],
        )
        save = save_dir + r"/决策树.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class ForestModel(StudyMachinebase):
    def __init__(
        self, args_use, model, *args, **kwargs
    ):  # model is the selected model type
        super(ForestModel, self).__init__(*args, **kwargs)
        model_class = {
            "Forest_class": RandomForestClassifier,
            "Forest": RandomForestRegressor,
        }[model]
        self.model = model_class(
            n_estimators=args_use["n_Tree"],
            criterion=args_use["criterion"],
            max_features=args_use["max_features"],
            max_depth=args_use["max_depth"],
            min_samples_split=args_use["min_samples_split"],
        )
        # recorded so the model can be cloned
        self.n_estimators = args_use["n_Tree"]
        self.criterion = args_use["criterion"]
        self.max_features = args_use["max_features"]
        self.max_depth = args_use["max_depth"]
        self.min_samples_split = args_use["min_samples_split"]
        self.k = {
            "n_estimators": args_use["n_Tree"],
            "criterion": args_use["criterion"],
            "max_features": args_use["max_features"],
            "max_depth": args_use["max_depth"],
            "min_samples_split": args_use["min_samples_split"],
        }
        self.model_Name = model  # keep the name string, not the class

    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        # visualise every tree in the forest
        for i in range(len(self.model.estimators_)):
            with open(save_dir + rf"\Tree_Gra[{i}].dot", "w") as f:
                export_graphviz(self.model.estimators_[i], out_file=f)
            tab.add(see_tree(save_dir + rf"\Tree_Gra[{i}].dot"), f"[{i}]决策树可视化")
        y = self.y_traindata
        x_data = self.x_traindata
        if self.model_Name == "Forest_class":
            class_ = self.model.classes_.tolist()
            class_heard = [f"类别[{i}]" for i in range(len(class_))]
            get, x_means, x_range, data_type = training_visualization(x_data, class_, y)
            for i in range(len(get)):
                tab.add(get[i], f"{i}训练数据散点图")
            get = decision_boundary(x_range, x_means, self.predict, class_, data_type)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            tab.add(
                make_tab(
                    class_heard + [f"普适预测第{i}特征" for i in range(len(x_means))],
                    [class_ + [f"{i}" for i in x_means]],
                ),
                "数据表",
            )
        else:
            get, x_means, x_range, data_type = regress_visualization(x_data, y)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测类型图")
            get = prediction_boundary(x_range, x_means, self.predict, data_type)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            tab.add(
                make_tab(
                    [f"普适预测第{i}特征" for i in range(len(x_means))],
                    [[f"{i}" for i in x_means]],
                ),
                "数据表",
            )
        des_to_csv(
            save_dir,
            "预测表",
            [[f"{i}" for i in x_means]],
            [f"普适预测第{i}特征" for i in range(len(x_means))],
        )
        save = save_dir + r"/随机森林.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class GradienttreeModel(StudyMachinebase):  # like TreeModel, mainly sharing its visualization design
    def __init__(
        self, args_use, model, *args, **kwargs
    ):  # model is the selected model type
        super(GradienttreeModel, self).__init__(*args, **kwargs)  # TreeModel's init is not needed
        model_class = {
            "GradientTree_class": GradientBoostingClassifier,
            "GradientTree": GradientBoostingRegressor,
        }[model]
        self.model = model_class(
            n_estimators=args_use["n_Tree"],
            max_features=args_use["max_features"],
            max_depth=args_use["max_depth"],
            min_samples_split=args_use["min_samples_split"],
        )
        # recorded so the model can be cloned
        self.criterion = args_use["criterion"]
        self.splitter = args_use["splitter"]
        self.max_features = args_use["max_features"]
        self.max_depth = args_use["max_depth"]
        self.min_samples_split = args_use["min_samples_split"]
        self.k = {
            "criterion": args_use["criterion"],
            "splitter": args_use["splitter"],
            "max_features": args_use["max_features"],
            "max_depth": args_use["max_depth"],
            "min_samples_split": args_use["min_samples_split"],
        }
        self.model_Name = model  # keep the name string, not the class

    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        # visualise every tree in the ensemble
        for a in range(len(self.model.estimators_)):
            for i in range(len(self.model.estimators_[a])):
                with open(save_dir + rf"\Tree_Gra[{a},{i}].dot", "w") as f:
                    export_graphviz(self.model.estimators_[a][i], out_file=f)
                tab.add(
                    see_tree(save_dir + rf"\Tree_Gra[{a},{i}].dot"), f"[{a},{i}]决策树可视化"
                )
        y = self.y_traindata
        x_data = self.x_traindata
        if self.model_Name == "GradientTree_class":  # was "Tree_class", which never matched
            class_ = self.model.classes_.tolist()
            class_heard = [f"类别[{i}]" for i in range(len(class_))]
            get, x_means, x_range, data_type = training_visualization(x_data, class_, y)
            for i in range(len(get)):
                tab.add(get[i], f"{i}训练数据散点图")
            get = decision_boundary(x_range, x_means, self.predict, class_, data_type)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            tab.add(
                make_tab(
                    class_heard + [f"普适预测第{i}特征" for i in range(len(x_means))],
                    [class_ + [f"{i}" for i in x_means]],
                ),
                "数据表",
            )
        else:
            get, x_means, x_range, data_type = regress_visualization(x_data, y)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测类型图")
            get = prediction_boundary(x_range, x_means, self.predict, data_type)
            for i in range(len(get)):
                tab.add(get[i], f"{i}预测热力图")
            tab.add(
                make_tab(
                    [f"普适预测第{i}特征" for i in range(len(x_means))],
                    [[f"{i}" for i in x_means]],
                ),
                "数据表",
            )
        des_to_csv(
            save_dir,
            "预测表",
            [[f"{i}" for i in x_means]],
            [f"普适预测第{i}特征" for i in range(len(x_means))],
        )
        save = save_dir + r"/梯度提升回归树.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class SvcModel(StudyMachinebase):
    def __init__(
        self, args_use, model, *args, **kwargs
    ):  # model is the selected model type
        super(SvcModel, self).__init__(*args, **kwargs)
        self.model = SVC(
            C=args_use["C"], gamma=args_use["gamma"], kernel=args_use["kernel"]
        )
        # recorded so the model can be cloned
        self.C = args_use["C"]
        self.gamma = args_use["gamma"]
        self.kernel = args_use["kernel"]
        self.k = {
            "C": args_use["C"],
            "gamma": args_use["gamma"],
            "kernel": args_use["kernel"],
        }
        self.model_Name = model

    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        try:
            w_list = self.model.coef_.tolist()  # only exists for a linear kernel
            b = self.model.intercept_.tolist()
            have_w = True
        except BaseException:
            have_w = False
        class_ = self.model.classes_.tolist()
        class_heard = [f"类别[{i}]" for i in range(len(class_))]
        y = self.y_traindata
        x_data = self.x_traindata
        get, x_means, x_range, data_type = training_visualization(x_data, class_, y)
        if have_w:
            get_line: list = training_w(x_data, class_, y, w_list, b, x_means.copy())
        for i in range(len(get)):
            if have_w:
                tab.add(get[i].overlap(get_line[i]), f"{i}决策边界散点图")
            else:
                tab.add(get[i], f"{i}决策边界散点图")
        get = decision_boundary(x_range, x_means, self.predict, class_, data_type)
        for i in range(len(get)):
            tab.add(get[i], f"{i}预测热力图")
        dic = {2: "离散", 1: "连续"}
        tab.add(
            make_tab(
                class_heard
                + [f"普适预测第{i}特征:{dic[data_type[i]]}" for i in range(len(x_means))],
                [class_ + [f"{i}" for i in x_means]],
            ),
            "数据表",
        )
        if have_w:
            des_to_csv(
                save_dir, "系数表", w_list, [f"系数W[{i}]" for i in range(len(w_list[0]))]
            )
            des_to_csv(save_dir, "截距表", [b], [f"截距{i}" for i in range(len(b))])
        des_to_csv(
            save_dir,
            "预测表",
            [[f"{i}" for i in x_means]],
            [f"普适预测第{i}特征" for i in range(len(x_means))],
        )
        save = save_dir + r"/支持向量机分类.HTML"
        tab.render(save)  # render the HTML
        return (save,)

class SvrModel(StudyMachinebase):
    def __init__(
        self, args_use, model, *args, **kwargs
    ):  # model is the selected model type
        super(SvrModel, self).__init__(*args, **kwargs)
        self.model = SVR(
            C=args_use["C"], gamma=args_use["gamma"], kernel=args_use["kernel"]
        )
        # recorded so the model can be cloned
        self.C = args_use["C"]
        self.gamma = args_use["gamma"]
        self.kernel = args_use["kernel"]
        self.k = {
            "C": args_use["C"],
            "gamma": args_use["gamma"],
            "kernel": args_use["kernel"],
        }
        self.model_Name = model

    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        x_data = self.x_traindata
        y = self.y_traindata
        try:
            w_list = self.model.coef_.tolist()  # only exists for a linear kernel
            b = self.model.intercept_.tolist()
            have_w = True
        except BaseException:
            have_w = False
        get, x_means, x_range, data_type = regress_visualization(x_data, y)
        if have_w:
            get_line = regress_w(x_data, w_list, b, x_means.copy())
        for i in range(len(get)):
            if have_w:
                tab.add(get[i].overlap(get_line[i]), f"{i}预测类型图")
            else:
                tab.add(get[i], f"{i}预测类型图")
        get = prediction_boundary(x_range, x_means, self.predict, data_type)
        for i in range(len(get)):
            tab.add(get[i], f"{i}预测热力图")
        if have_w:
            des_to_csv(
                save_dir, "系数表", w_list, [f"系数W[{i}]" for i in range(len(w_list[0]))]
            )
            des_to_csv(save_dir, "截距表", [b], [f"截距{i}" for i in range(len(b))])
        des_to_csv(
            save_dir,
            "预测表",
            [[f"{i}" for i in x_means]],
            [f"普适预测第{i}特征" for i in range(len(x_means))],
        )
        tab.add(
            make_tab(
                [f"普适预测第{i}特征" for i in range(len(x_means))],
                [[f"{i}" for i in x_means]],
            ),
            "数据表",
        )
        save = save_dir + r"/支持向量机回归.HTML"
        tab.render(save)  # render the HTML
        return (save,)
  2608. class VarianceModel(Unsupervised): # 无监督
  2609. def __init__(
  2610. self, args_use, model, *args, **kwargs
  2611. ): # model表示当前选用的模型类型,Alpha针对正则化的参数
  2612. super(VarianceModel, self).__init__(*args, **kwargs)
  2613. self.model = VarianceThreshold(threshold=(args_use["P"] * (1 - args_use["P"])))
  2614. # 记录这两个是为了克隆
  2615. self.threshold = args_use["P"]
  2616. self.k = {"threshold": args_use["P"]}
  2617. self.model_Name = model
  2618. def data_visualization(self, save_dir, *args, **kwargs):
  2619. tab = Tab()
        var = self.model.variances_  # variances_ is the variance, not the standard deviation
  2621. y_data = self.y_testdata
  2622. if isinstance(y_data, np.ndarray):
  2623. get = feature_visualization(self.y_testdata)
  2624. for i in range(len(get)):
  2625. tab.add(get[i], f"[{i}]数据x-x散点图")
  2626. c = (
  2627. Bar()
  2628. .add_xaxis([f"[{i}]特征" for i in range(len(var))])
            .add_yaxis("方差", var.tolist(), **label_setting)
  2630. .set_global_opts(
                title_opts=opts.TitleOpts(title="方差柱状图"), **global_setting
  2632. )
  2633. )
        tab.add(c, "数据方差")
  2635. save = save_dir + r"/方差特征选择.HTML"
  2636. tab.render(save) # 生成HTML
  2637. return (save,)
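# Illustrative sketch (not part of the pipeline): VarianceThreshold with a
# threshold of p * (1 - p) drops boolean features that are (nearly) constant,
# since a Bernoulli(q) feature has variance q * (1 - q). All names below are
# local to the example.
def _demo_variance_threshold():
    import numpy as np
    from sklearn.feature_selection import VarianceThreshold

    x = np.array([[0, 1], [0, 0]] * 5)  # column 0 is constant, column 1 alternates
    p = 0.8
    selector = VarianceThreshold(threshold=p * (1 - p))
    kept = selector.fit_transform(x)  # only the alternating column survives
    return selector.variances_, kept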
  2638. class SelectkbestModel(PrepBase): # 有监督
  2639. def __init__(self, args_use, model, *args, **kwargs):
  2640. super(SelectkbestModel, self).__init__(*args, **kwargs)
  2641. self.model = SelectKBest(k=args_use["k"], score_func=args_use["score_func"])
  2642. # 记录这两个是为了克隆
  2643. self.k_ = args_use["k"]
  2644. self.score_func = args_use["score_func"]
  2645. self.k = {"k": args_use["k"], "score_func": args_use["score_func"]}
  2646. self.model_Name = model
  2647. def data_visualization(self, save_dir, *args, **kwargs):
  2648. tab = Tab()
  2649. score = self.model.scores_.tolist()
  2650. support = self.model.get_support()
  2651. y_data = self.y_traindata
  2652. x_data = self.x_traindata
  2653. if isinstance(x_data, np.ndarray):
  2654. get = feature_visualization(x_data)
  2655. for i in range(len(get)):
  2656. tab.add(get[i], f"[{i}]训练数据x-x散点图")
  2657. if isinstance(y_data, np.ndarray):
  2658. get = feature_visualization(y_data)
  2659. for i in range(len(get)):
  2660. tab.add(get[i], f"[{i}]保留训练数据x-x散点图")
  2661. y_data = self.y_testdata
  2662. x_data = self.x_testdata
  2663. if isinstance(x_data, np.ndarray):
  2664. get = feature_visualization(x_data)
  2665. for i in range(len(get)):
  2666. tab.add(get[i], f"[{i}]数据x-x散点图")
  2667. if isinstance(y_data, np.ndarray):
  2668. get = feature_visualization(y_data)
  2669. for i in range(len(get)):
  2670. tab.add(get[i], f"[{i}]保留数据x-x散点图")
  2671. choose = []
  2672. un_choose = []
  2673. for i in range(len(score)):
  2674. if support[i]:
  2675. choose.append(score[i])
  2676. un_choose.append(0) # 占位
  2677. else:
  2678. un_choose.append(score[i])
  2679. choose.append(0)
  2680. c = (
  2681. Bar()
  2682. .add_xaxis([f"[{i}]特征" for i in range(len(score))])
  2683. .add_yaxis("选中特征", choose, **label_setting)
  2684. .add_yaxis("抛弃特征", un_choose, **label_setting)
  2685. .set_global_opts(
                title_opts=opts.TitleOpts(title="单变量特征得分柱状图"), **global_setting
  2687. )
  2688. )
  2689. tab.add(c, "单变量重要程度")
  2690. save = save_dir + r"/单一变量特征选择.HTML"
  2691. tab.render(save) # 生成HTML
  2692. return (save,)
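# Illustrative sketch (not part of the pipeline): how scores_ and get_support()
# relate for SelectKBest, mirroring the choose/un_choose split drawn above.
def _demo_select_k_best():
    import numpy as np
    from sklearn.feature_selection import SelectKBest, f_classif

    x = np.array([[1.0, 5.0], [2.0, 5.1], [3.0, 4.9], [4.0, 5.0]])
    y = np.array([0, 0, 1, 1])
    selector = SelectKBest(score_func=f_classif, k=1).fit(x, y)
    # get_support() is True exactly for the k highest-scoring features
    return selector.scores_, selector.get_support()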
  2693. class SelectFromModel(PrepBase): # 有监督
  2694. def __init__(
  2695. self, args_use, learner, *args, **kwargs
    ):  # learner is the wrapped estimator whose importances drive the selection
        super(SelectFromModel, self).__init__(*args, **kwargs)
        self.model = learner.Model
        # This class shadows sklearn's SelectFromModel, so import the sklearn
        # class under an alias to avoid recursively instantiating ourselves.
        from sklearn.feature_selection import SelectFromModel as SklearnSelectFromModel

        self.Select_Model = SklearnSelectFromModel(
            estimator=learner.Model, max_features=args_use["k"], prefit=learner.have_Fit
        )
  2702. self.max_features = args_use["k"]
  2703. self.estimator = learner.Model
  2704. self.k = {
  2705. "max_features": args_use["k"],
  2706. "estimator": learner.Model,
  2707. "have_Fit": learner.have_Fit,
  2708. }
  2709. self.have_fit = learner.have_Fit
  2710. self.model_Name = "SelectFrom_Model"
  2711. self.learner = learner
  2712. def fit_model(self, x_data, y_data, split=0.3, *args, **kwargs):
  2713. y_data = y_data.ravel()
  2714. if not self.have_fit: # 不允许第二次训练
  2715. self.Select_Model.fit(x_data, y_data)
  2716. self.have_fit = True
  2717. return "None", "None"
  2718. def predict(self, x_data, *args, **kwargs):
  2719. try:
  2720. self.x_testdata = x_data.copy()
  2721. x_predict = self.Select_Model.transform(x_data)
  2722. self.y_testdata = x_predict.copy()
  2723. self.have_predict = True
  2724. return x_predict, "模型特征工程"
  2725. except BaseException:
  2726. self.have_predict = True
  2727. return np.array([]), "无结果工程"
  2728. def data_visualization(self, save_dir, *args, **kwargs):
  2729. tab = Tab()
  2730. support = self.Select_Model.get_support()
  2731. y_data = self.y_testdata
  2732. x_data = self.x_testdata
  2733. if isinstance(x_data, np.ndarray):
  2734. get = feature_visualization(x_data)
  2735. for i in range(len(get)):
  2736. tab.add(get[i], f"[{i}]数据x-x散点图")
  2737. if isinstance(y_data, np.ndarray):
  2738. get = feature_visualization(y_data)
  2739. for i in range(len(get)):
  2740. tab.add(get[i], f"[{i}]保留数据x-x散点图")
  2741. def make_bar(score):
  2742. choose = []
  2743. un_choose = []
  2744. for i in range(len(score)):
  2745. if support[i]:
  2746. choose.append(abs(score[i]))
  2747. un_choose.append(0) # 占位
  2748. else:
  2749. un_choose.append(abs(score[i]))
  2750. choose.append(0)
  2751. c = (
  2752. Bar()
  2753. .add_xaxis([f"[{i}]特征" for i in range(len(score))])
  2754. .add_yaxis("选中特征", choose, **label_setting)
  2755. .add_yaxis("抛弃特征", un_choose, **label_setting)
  2756. .set_global_opts(
                    title_opts=opts.TitleOpts(title="模型特征重要性柱状图"), **global_setting
  2758. )
  2759. )
  2760. tab.add(c, "单变量重要程度")
  2761. try:
  2762. make_bar(self.model.coef_)
  2763. except BaseException:
  2764. try:
  2765. make_bar(self.model.feature_importances_)
  2766. except BaseException:
  2767. pass
  2768. save = save_dir + r"/模型特征选择.HTML"
  2769. tab.render(save) # 生成HTML
  2770. return (save,)
  2771. class StandardizationModel(Unsupervised): # z-score标准化 无监督
  2772. def __init__(self, *args, **kwargs):
  2773. super(StandardizationModel, self).__init__(*args, **kwargs)
  2774. self.model = StandardScaler()
  2775. self.k = {}
  2776. self.model_Name = "StandardScaler"
  2777. def data_visualization(self, save_dir, *args, **kwargs):
  2778. tab = Tab()
  2779. y_data = self.y_testdata
  2780. x_data = self.x_testdata
  2781. var = self.model.var_.tolist()
  2782. means = self.model.mean_.tolist()
  2783. scale = self.model.scale_.tolist()
  2784. conversion_control(y_data, x_data, tab)
        make_bar("方差", var, tab)
        make_bar("均值", means, tab)
        make_bar("Scale", scale, tab)
  2788. save = save_dir + r"/z-score标准化.HTML"
  2789. tab.render(save) # 生成HTML
  2790. return (save,)
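# Illustrative sketch (not part of the pipeline): the relation between the
# mean_, var_ and scale_ attributes visualized above; scale_ == sqrt(var_)
# and the transform is (x - mean_) / scale_.
def _demo_standard_scaler():
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    x = np.array([[1.0], [2.0], [3.0]])
    scaler = StandardScaler().fit(x)
    z = scaler.transform(x)
    assert np.allclose(z, (x - scaler.mean_) / scaler.scale_)
    assert np.allclose(scaler.scale_, np.sqrt(scaler.var_))
    return z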
  2791. class MinmaxscalerModel(Unsupervised): # 离差标准化
  2792. def __init__(self, args_use, *args, **kwargs):
  2793. super(MinmaxscalerModel, self).__init__(*args, **kwargs)
  2794. self.model = MinMaxScaler(feature_range=args_use["feature_range"])
  2795. self.k = {}
  2796. self.model_Name = "MinMaxScaler"
  2797. def data_visualization(self, save_dir, *args, **kwargs):
  2798. tab = Tab()
  2799. y_data = self.y_testdata
  2800. x_data = self.x_testdata
  2801. scale = self.model.scale_.tolist()
  2802. max_ = self.model.data_max_.tolist()
  2803. min_ = self.model.data_min_.tolist()
  2804. conversion_control(y_data, x_data, tab)
  2805. make_bar("Scale", scale, tab)
  2806. tab.add(
  2807. make_tab(
  2808. heard=[f"[{i}]特征最大值" for i in range(len(max_))]
  2809. + [f"[{i}]特征最小值" for i in range(len(min_))],
  2810. row=[max_ + min_],
  2811. ),
  2812. "数据表格",
  2813. )
  2814. save = save_dir + r"/离差标准化.HTML"
  2815. tab.render(save) # 生成HTML
  2816. return (save,)
  2817. class LogscalerModel(PrepBase): # 对数标准化
  2818. def __init__(self, *args, **kwargs):
  2819. super(LogscalerModel, self).__init__(*args, **kwargs)
  2820. self.model = None
  2821. self.k = {}
  2822. self.model_Name = "LogScaler"
  2823. def fit_model(self, x_data, *args, **kwargs):
  2824. if not self.have_predict: # 不允许第二次训练
  2825. self.max_logx = np.log(x_data.max())
  2826. self.have_fit = True
  2827. return "None", "None"
  2828. def predict(self, x_data, *args, **kwargs):
  2829. try:
  2830. max_logx = self.max_logx
  2831. except BaseException:
  2832. self.have_fit = False
  2833. self.fit_model(x_data)
  2834. max_logx = self.max_logx
  2835. self.x_testdata = x_data.copy()
  2836. x_predict = np.log(x_data) / max_logx
  2837. self.y_testdata = x_predict.copy()
  2838. self.have_predict = True
  2839. return x_predict, "对数变换"
  2840. def data_visualization(self, save_dir, *args, **kwargs):
  2841. tab = Tab()
  2842. y_data = self.y_testdata
  2843. x_data = self.x_testdata
  2844. conversion_control(y_data, x_data, tab)
  2845. tab.add(make_tab(heard=["最大对数值(自然对数)"], row=[[str(self.max_logx)]]), "数据表格")
  2846. save = save_dir + r"/对数标准化.HTML"
  2847. tab.render(save) # 生成HTML
  2848. return (save,)
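# Illustrative sketch (not part of the pipeline): the log scaling used above,
# x' = ln(x) / ln(x_max), maps the maximum to 1 and requires positive inputs.
def _demo_log_scaler():
    import numpy as np

    x = np.array([1.0, 10.0, 100.0])
    scaled = np.log(x) / np.log(x.max())
    return scaled  # -> [0.0, 0.5, 1.0]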
  2849. class AtanscalerModel(PrepBase): # atan标准化
  2850. def __init__(self, *args, **kwargs):
  2851. super(AtanscalerModel, self).__init__(*args, **kwargs)
  2852. self.model = None
  2853. self.k = {}
  2854. self.model_Name = "atanScaler"
  2855. def fit_model(self, x_data, *args, **kwargs):
  2856. self.have_fit = True
  2857. return "None", "None"
  2858. def predict(self, x_data, *args, **kwargs):
  2859. self.x_testdata = x_data.copy()
  2860. x_predict = np.arctan(x_data) * (2 / np.pi)
  2861. self.y_testdata = x_predict.copy()
  2862. self.have_predict = True
  2863. return x_predict, "atan变换"
  2864. def data_visualization(self, save_dir, *args, **kwargs):
  2865. tab = Tab()
  2866. y_data = self.y_testdata
  2867. x_data = self.x_testdata
  2868. conversion_control(y_data, x_data, tab)
  2869. save = save_dir + r"/反正切函数标准化.HTML"
  2870. tab.render(save) # 生成HTML
  2871. return (save,)
  2872. class DecimalscalerModel(PrepBase): # 小数定标准化
  2873. def __init__(self, *args, **kwargs):
  2874. super(DecimalscalerModel, self).__init__(*args, **kwargs)
  2875. self.model = None
  2876. self.k = {}
  2877. self.model_Name = "Decimal_normalization"
  2878. def fit_model(self, x_data, *args, **kwargs):
  2879. if not self.have_predict: # 不允许第二次训练
  2880. self.j = max([judging_digits(x_data.max()), judging_digits(x_data.min())])
  2881. self.have_fit = True
  2882. return "None", "None"
  2883. def predict(self, x_data, *args, **kwargs):
  2884. self.x_testdata = x_data.copy()
  2885. try:
  2886. j = self.j
  2887. except BaseException:
  2888. self.have_fit = False
  2889. self.fit_model(x_data)
  2890. j = self.j
  2891. x_predict = x_data / (10 ** j)
  2892. self.y_testdata = x_predict.copy()
  2893. self.have_predict = True
  2894. return x_predict, "小数定标标准化"
  2895. def data_visualization(self, save_dir, *args, **kwargs):
  2896. tab = Tab()
  2897. y_data = self.y_testdata
  2898. x_data = self.x_testdata
  2899. j = self.j
  2900. conversion_control(y_data, x_data, tab)
  2901. tab.add(make_tab(heard=["小数位数:j"], row=[[j]]), "数据表格")
  2902. save = save_dir + r"/小数定标标准化.HTML"
  2903. tab.render(save) # 生成HTML
  2904. return (save,)
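# Illustrative sketch (not part of the pipeline): decimal scaling divides by
# 10**j so every value falls into (-1, 1). judging_digits is defined elsewhere
# in this file; the ceil(log10(...)) below is an assumed equivalent used only
# for this example.
def _demo_decimal_scaling():
    import numpy as np

    x = np.array([-512.0, 64.0, 987.0])
    j = int(np.ceil(np.log10(np.abs(x).max())))  # j = 3 here
    return x / (10 ** j)  # -> [-0.512, 0.064, 0.987]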
  2905. class MapzoomModel(PrepBase): # 映射标准化
  2906. def __init__(self, args_use, *args, **kwargs):
  2907. super(MapzoomModel, self).__init__(*args, **kwargs)
  2908. self.model = None
  2909. self.feature_range = args_use["feature_range"]
  2910. self.k = {}
        self.model_Name = "Mapzoom"  # was mistakenly copied from DecimalscalerModel
  2912. def fit_model(self, x_data, *args, **kwargs):
  2913. if not self.have_predict: # 不允许第二次训练
  2914. self.max_ = x_data.max()
  2915. self.min_ = x_data.min()
  2916. self.have_fit = True
  2917. return "None", "None"
  2918. def predict(self, x_data, *args, **kwargs):
  2919. self.x_testdata = x_data.copy()
  2920. try:
  2921. max_ = self.max_
  2922. min_ = self.min_
  2923. except BaseException:
  2924. self.have_fit = False
  2925. self.fit_model(x_data)
  2926. max_ = self.max_
  2927. min_ = self.min_
  2928. x_predict = (x_data * (self.feature_range[1] - self.feature_range[0])) / (
  2929. max_ - min_
  2930. )
  2931. self.y_testdata = x_predict.copy()
  2932. self.have_predict = True
  2933. return x_predict, "映射标准化"
  2934. def data_visualization(self, save_dir, *args, **kwargs):
  2935. tab = Tab()
  2936. y_data = self.y_testdata
  2937. x_data = self.x_testdata
  2938. max_ = self.max_
  2939. min_ = self.min_
  2940. conversion_control(y_data, x_data, tab)
  2941. tab.add(make_tab(heard=["最大值", "最小值"], row=[[max_, min_]]), "数据表格")
  2942. save = save_dir + r"/映射标准化.HTML"
  2943. tab.render(save) # 生成HTML
  2944. return (save,)
  2945. class SigmodscalerModel(PrepBase): # sigmod变换
  2946. def __init__(self, *args, **kwargs):
  2947. super(SigmodscalerModel, self).__init__(*args, **kwargs)
  2948. self.model = None
  2949. self.k = {}
  2950. self.model_Name = "sigmodScaler_Model"
  2951. def fit_model(self, x_data, *args, **kwargs):
  2952. self.have_fit = True
  2953. return "None", "None"
  2954. def predict(self, x_data: np.array, *args, **kwargs):
  2955. self.x_testdata = x_data.copy()
  2956. x_predict = 1 / (1 + np.exp(-x_data))
  2957. self.y_testdata = x_predict.copy()
  2958. self.have_predict = True
  2959. return x_predict, "Sigmod变换"
  2960. def data_visualization(self, save_dir, *args, **kwargs):
  2961. tab = Tab()
  2962. y_data = self.y_testdata
  2963. x_data = self.x_testdata
  2964. conversion_control(y_data, x_data, tab)
  2965. save = save_dir + r"/Sigmoid变换.HTML"
  2966. tab.render(save) # 生成HTML
  2967. return (save,)
  2968. class FuzzyQuantizationModel(PrepBase): # 模糊量化标准化
  2969. def __init__(self, args_use, *args, **kwargs):
  2970. super(FuzzyQuantizationModel, self).__init__(*args, **kwargs)
  2971. self.model = None
  2972. self.feature_range = args_use["feature_range"]
  2973. self.k = {}
  2974. self.model_Name = "Fuzzy_quantization"
    def fit_model(self, x_data, *args, **kwargs):
        if not self.have_predict:  # a second round of training is not allowed
            self.max_ = x_data.max()
            self.min_ = x_data.min()  # was overwriting max_ by mistake
            self.have_fit = True
        return "None", "None"

    def predict(self, x_data, *args, **kwargs):
        self.x_testdata = x_data.copy()
        try:
            max_ = self.max_
            min_ = self.min_
        except BaseException:
            self.have_fit = False
            self.fit_model(x_data)
            max_ = self.max_
            min_ = self.min_
        # membership: min_ -> 0, the midpoint -> 0.5, max_ -> 1
        x_predict = 1 / 2 + (1 / 2) * np.sin(
            np.pi / (max_ - min_) * (x_data - (max_ + min_) / 2)
        )
  2994. self.y_testdata = x_predict.copy()
  2995. self.have_predict = True
  2996. return x_predict, "模糊量化标准化"
  2997. def data_visualization(self, save_dir, *args, **kwargs):
  2998. tab = Tab()
        y_data = self.y_testdata
        x_data = self.x_testdata
        max_ = self.max_
        min_ = self.min_
  3003. conversion_control(y_data, x_data, tab)
  3004. tab.add(make_tab(heard=["最大值", "最小值"], row=[[max_, min_]]), "数据表格")
  3005. save = save_dir + r"/模糊量化标准化.HTML"
  3006. tab.render(save) # 生成HTML
  3007. return (save,)
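# Illustrative sketch (not part of the pipeline): the fuzzy membership mapping
# used above sends min_ to 0, the midpoint to 0.5 and max_ to 1.
def _demo_fuzzy_quantization():
    import numpy as np

    x = np.array([0.0, 5.0, 10.0])
    max_, min_ = x.max(), x.min()
    member = 1 / 2 + (1 / 2) * np.sin(
        np.pi / (max_ - min_) * (x - (max_ + min_) / 2)
    )
    return member  # -> [0.0, 0.5, 1.0]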
  3008. class RegularizationModel(Unsupervised): # 正则化
  3009. def __init__(self, args_use, *args, **kwargs):
  3010. super(RegularizationModel, self).__init__(*args, **kwargs)
  3011. self.model = Normalizer(norm=args_use["norm"])
  3012. self.k = {"norm": args_use["norm"]}
  3013. self.model_Name = "Regularization"
  3014. def data_visualization(self, save_dir, *args, **kwargs):
  3015. tab = Tab()
  3016. y_data = self.y_testdata.copy()
  3017. x_data = self.x_testdata.copy()
  3018. conversion_control(y_data, x_data, tab)
  3019. save = save_dir + r"/正则化.HTML"
  3020. tab.render(save) # 生成HTML
  3021. return (save,)
  3022. # 离散数据
  3023. class BinarizerModel(Unsupervised): # 二值化
  3024. def __init__(self, args_use, *args, **kwargs):
  3025. super(BinarizerModel, self).__init__(*args, **kwargs)
  3026. self.model = Binarizer(threshold=args_use["threshold"])
  3027. self.k = {}
  3028. self.model_Name = "Binarizer"
  3029. def data_visualization(self, save_dir, *args, **kwargs):
  3030. tab = Tab()
  3031. y_data = self.y_testdata
  3032. x_data = self.x_testdata
  3033. get_y = discrete_feature_visualization(y_data, "转换数据") # 转换
  3034. for i in range(len(get_y)):
  3035. tab.add(get_y[i], f"[{i}]数据x-x离散散点图")
  3036. heard = [f"特征:{i}" for i in range(len(x_data[0]))]
  3037. tab.add(make_tab(heard, x_data.tolist()), f"原数据")
  3038. tab.add(make_tab(heard, y_data.tolist()), f"编码数据")
  3039. tab.add(make_tab(heard, np.dstack((x_data, y_data)).tolist()), f"合成[原数据,编码]数据")
  3040. save = save_dir + r"/二值离散化.HTML"
  3041. tab.render(save) # 生成HTML
  3042. return (save,)
  3043. class DiscretizationModel(PrepBase): # n值离散
  3044. def __init__(self, args_use, *args, **kwargs):
  3045. super(DiscretizationModel, self).__init__(*args, **kwargs)
  3046. self.model = None
  3047. range_ = args_use["split_range"]
        if not range_:
            raise ValueError("split_range must contain at least one split point")
        elif len(range_) == 1:
            range_.append(range_[0])
  3052. self.range = range_
  3053. self.k = {}
  3054. self.model_Name = "Discretization"
  3055. def fit_model(self, *args, **kwargs):
  3056. # t值在模型创建时已经保存
  3057. self.have_fit = True
  3058. return "None", "None"
  3059. def predict(self, x_data, *args, **kwargs):
  3060. self.x_testdata = x_data.copy()
  3061. x_predict = x_data.copy() # 复制
  3062. range_ = self.range
  3063. bool_list = []
  3064. max_ = len(range_) - 1
  3065. o_t = None
  3066. for i in range(len(range_)):
  3067. try:
  3068. t = float(range_[i])
  3069. except BaseException:
  3070. continue
            if o_t is None:  # first split point
                bool_list.append(x_predict <= t)
            else:
                # elementwise AND (not ==) selects the half-open interval [o_t, t)
                bool_list.append((o_t <= x_predict) & (x_predict < t))
            if i == max_:
                bool_list.append(t <= x_predict)
            o_t = t
  3078. for i in range(len(bool_list)):
  3079. x_predict[bool_list[i]] = i
  3080. self.y_testdata = x_predict.copy()
  3081. self.have_predict = True
  3082. return x_predict, f"{len(bool_list)}值离散化"
  3083. def data_visualization(self, save_dir, *args, **kwargs):
  3084. tab = Tab()
  3085. y_data = self.y_testdata
  3086. x_data = self.x_testdata
  3087. get_y = discrete_feature_visualization(y_data, "转换数据") # 转换
  3088. for i in range(len(get_y)):
  3089. tab.add(get_y[i], f"[{i}]数据x-x离散散点图")
  3090. heard = [f"特征:{i}" for i in range(len(x_data[0]))]
  3091. tab.add(make_tab(heard, x_data.tolist()), f"原数据")
  3092. tab.add(make_tab(heard, y_data.tolist()), f"编码数据")
  3093. tab.add(make_tab(heard, np.dstack((x_data, y_data)).tolist()), f"合成[原数据,编码]数据")
  3094. save = save_dir + r"/多值离散化.HTML"
  3095. tab.render(save) # 生成HTML
  3096. return (save,)
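# Illustrative sketch (not part of the pipeline): np.digitize produces the same
# bin numbering as the predict loop above (up to boundary handling at the first
# split point, where predict uses <= instead of <).
def _demo_discretization():
    import numpy as np

    x = np.array([-3.0, 0.5, 2.0, 9.0])
    split_points = [0.0, 1.0, 3.0]
    # bins: (-inf, 0) -> 0, [0, 1) -> 1, [1, 3) -> 2, [3, +inf) -> 3
    return np.digitize(x, split_points)  # -> [0, 1, 2, 3]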
  3097. class LabelModel(PrepBase): # 数字编码
  3098. def __init__(self, *args, **kwargs):
  3099. super(LabelModel, self).__init__(*args, **kwargs)
  3100. self.model = []
  3101. self.k = {}
  3102. self.model_Name = "LabelEncoder"
  3103. def fit_model(self, x_data, *args, **kwargs):
  3104. if not self.have_predict: # 不允许第二次训练
  3105. self.model = []
  3106. if x_data.ndim == 1:
  3107. x_data = np.array([x_data])
  3108. for i in range(x_data.shape[1]):
  3109. self.model.append(
  3110. LabelEncoder().fit(np.ravel(x_data[:, i]))
  3111. ) # 训练机器(每个特征一个学习器)
  3112. self.have_fit = True
  3113. return "None", "None"
    def predict(self, x_data, *args, **kwargs):
        self.x_testdata = x_data.copy()
        x_predict = x_data.copy()
        if x_data.ndim == 1:
            x_data = np.array([x_data])
            x_predict = x_data.copy()  # keep shapes aligned for column assignment
        for i in range(x_data.shape[1]):
            x_predict[:, i] = self.model[i].transform(x_data[:, i])
  3121. self.y_testdata = x_predict.copy()
  3122. self.have_predict = True
  3123. return x_predict, "数字编码"
  3124. def data_visualization(self, save_dir, *args, **kwargs):
  3125. tab = Tab()
  3126. x_data = self.x_testdata
  3127. y_data = self.y_testdata
  3128. get_y = discrete_feature_visualization(y_data, "转换数据") # 转换
  3129. for i in range(len(get_y)):
  3130. tab.add(get_y[i], f"[{i}]数据x-x离散散点图")
  3131. heard = [f"特征:{i}" for i in range(len(x_data[0]))]
  3132. tab.add(make_tab(heard, x_data.tolist()), f"原数据")
  3133. tab.add(make_tab(heard, y_data.tolist()), f"编码数据")
  3134. tab.add(make_tab(heard, np.dstack((x_data, y_data)).tolist()), f"合成[原数据,编码]数据")
  3135. save = save_dir + r"/数字编码.HTML"
  3136. tab.render(save) # 生成HTML
  3137. return (save,)
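# Illustrative sketch (not part of the pipeline): one LabelEncoder per column,
# as above; classes_ is sorted and transform maps each value to its index.
def _demo_label_encoder():
    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    col = np.array(["red", "green", "red", "blue"])
    encoder = LabelEncoder().fit(col)
    # classes_ -> ['blue', 'green', 'red'], so the codes are [2, 1, 2, 0]
    return encoder.classes_, encoder.transform(col)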
  3138. class OneHotEncoderModel(PrepBase): # 独热编码
  3139. def __init__(self, args_use, *args, **kwargs):
  3140. super(OneHotEncoderModel, self).__init__(*args, **kwargs)
  3141. self.model = []
  3142. self.ndim_up = args_use["ndim_up"]
  3143. self.k = {}
  3144. self.model_Name = "OneHotEncoder"
  3145. self.OneHot_Data = None # 三维独热编码
  3146. def fit_model(self, x_data, *args, **kwargs):
  3147. if not self.have_predict: # 不允许第二次训练
            if x_data.ndim == 1:
                x_data = np.array([x_data])  # a plain list has no .shape
  3150. for i in range(x_data.shape[1]):
  3151. data = np.expand_dims(x_data[:, i], axis=1) # 独热编码需要升维
  3152. self.model.append(OneHotEncoder().fit(data)) # 训练机器
  3153. self.have_fit = True
  3154. return "None", "None"
  3155. def predict(self, x_data, *args, **kwargs):
  3156. self.x_testdata = x_data.copy()
  3157. x_new = []
  3158. for i in range(x_data.shape[1]):
  3159. data = np.expand_dims(x_data[:, i], axis=1) # 独热编码需要升维
  3160. one_hot = self.model[i].transform(data).toarray().tolist()
  3161. x_new.append(one_hot) # 添加到列表中
  3162. # 新列表的行数据是原data列数据的独热码(只需要ndim=2,暂时没想到numpy的做法)
  3163. x_new = np.array(x_new)
  3164. x_predict = []
  3165. for i in range(x_new.shape[1]):
  3166. x_predict.append(x_new[:, i])
  3167. x_predict = np.array(x_predict) # 转换回array
  3168. self.OneHot_Data = x_predict.copy() # 保存未降维数据
  3169. if not self.ndim_up: # 压缩操作
  3170. new_x_predict = []
  3171. for i in x_predict:
  3172. new_list = []
  3173. list_ = i.tolist()
  3174. for a in list_:
  3175. new_list += a
  3176. new = np.array(new_list)
  3177. new_x_predict.append(new)
            self.y_testdata = np.array(new_x_predict)
            self.have_predict = True
            return self.y_testdata.copy(), "独热编码"
  3180. self.y_testdata = self.OneHot_Data
  3181. self.have_predict = True
  3182. return x_predict, "独热编码"
  3183. def data_visualization(self, save_dir, *args, **kwargs):
  3184. tab = Tab()
  3185. y_data = self.y_testdata
  3186. x_data = self.x_testdata
  3187. oh_data = self.OneHot_Data
  3188. if not self.ndim_up:
  3189. get_y = discrete_feature_visualization(y_data, "转换数据") # 转换
  3190. for i in range(len(get_y)):
  3191. tab.add(get_y[i], f"[{i}]数据x-x离散散点图")
  3192. heard = [f"特征:{i}" for i in range(len(x_data[0]))]
  3193. tab.add(make_tab(heard, x_data.tolist()), f"原数据")
  3194. tab.add(make_tab(heard, oh_data.tolist()), f"编码数据")
  3195. tab.add(make_tab(heard, np.dstack((oh_data, x_data)).tolist()), f"合成[原数据,编码]数据")
  3196. tab.add(
  3197. make_tab([f"编码:{i}" for i in range(len(y_data[0]))], y_data.tolist()), f"数据"
  3198. )
  3199. save = save_dir + r"/独热编码.HTML"
  3200. tab.render(save) # 生成HTML
  3201. return (save,)
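# Illustrative sketch (not part of the pipeline): OneHotEncoder needs a 2-D
# column, and each input row becomes a row with exactly one 1.
def _demo_one_hot():
    import numpy as np
    from sklearn.preprocessing import OneHotEncoder

    col = np.array([[0], [1], [2], [1]])  # a single feature, reshaped to 2-D
    one_hot = OneHotEncoder().fit_transform(col).toarray()
    return one_hot  # 4 rows x 3 categories, one 1 per row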
  3202. class MissedModel(Unsupervised): # 缺失数据补充
  3203. def __init__(self, args_use, *args, **kwargs):
  3204. super(MissedModel, self).__init__(*args, **kwargs)
  3205. self.model = SimpleImputer(
  3206. missing_values=args_use["miss_value"],
  3207. strategy=args_use["fill_method"],
  3208. fill_value=args_use["fill_value"],
  3209. )
  3210. self.k = {}
  3211. self.model_Name = "Missed"
  3212. def predict(self, x_data, *args, **kwargs):
  3213. self.x_testdata = x_data.copy()
  3214. x_predict = self.model.transform(x_data)
  3215. self.y_testdata = x_predict.copy()
  3216. self.have_predict = True
  3217. return x_predict, "填充缺失"
  3218. def data_visualization(self, save_dir, *args, **kwargs):
  3219. tab = Tab()
  3220. y_data = self.y_testdata
  3221. x_data = self.x_testdata
  3222. statistics = self.model.statistics_.tolist()
  3223. conversion_control(y_data, x_data, tab)
  3224. tab.add(
  3225. make_tab([f"特征[{i}]" for i in range(len(statistics))], [statistics]), "填充值"
  3226. )
  3227. save = save_dir + r"/缺失数据填充.HTML"
  3228. tab.render(save) # 生成HTML
  3229. return (save,)
  3230. class PcaModel(Unsupervised):
  3231. def __init__(self, args_use, *args, **kwargs):
  3232. super(PcaModel, self).__init__(*args, **kwargs)
  3233. self.model = PCA(
  3234. n_components=args_use["n_components"], whiten=args_use["white_PCA"]
  3235. )
  3236. self.whiten = args_use["white_PCA"]
  3237. self.n_components = args_use["n_components"]
  3238. self.k = {
  3239. "n_components": args_use["n_components"],
  3240. "whiten": args_use["white_PCA"],
  3241. }
  3242. self.model_Name = "PCA"
  3243. def predict(self, x_data, *args, **kwargs):
  3244. self.x_testdata = x_data.copy()
  3245. x_predict = self.model.transform(x_data)
  3246. self.y_testdata = x_predict.copy()
  3247. self.have_predict = True
  3248. return x_predict, "PCA"
  3249. def data_visualization(self, save_dir, *args, **kwargs):
  3250. tab = Tab()
  3251. y_data = self.y_testdata
  3252. importance = self.model.components_.tolist()
  3253. var = self.model.explained_variance_.tolist() # 方量差
  3254. conversion_separate_format(y_data, tab)
        x_data = [f"第{i+1}主成分" for i in range(len(importance))]  # principal components
        y_data = [f"特征[{i}]" for i in range(len(importance[0]))]  # features
  3257. value = [
  3258. (f"第{i+1}主成分", f"特征[{j}]", importance[i][j])
  3259. for i in range(len(importance))
  3260. for j in range(len(importance[i]))
  3261. ]
  3262. c = (
  3263. HeatMap()
  3264. .add_xaxis(x_data)
  3265. .add_yaxis(f"", y_data, value, **label_setting) # value的第一个数值是x
  3266. .set_global_opts(
  3267. title_opts=opts.TitleOpts(title="预测热力图"),
  3268. **global_not_legend,
  3269. yaxis_opts=opts.AxisOpts(is_scale=True), # 'category'
  3270. xaxis_opts=opts.AxisOpts(is_scale=True),
  3271. visualmap_opts=opts.VisualMapOpts(
  3272. is_show=True,
  3273. max_=int(self.model.components_.max()) + 1,
  3274. min_=int(self.model.components_.min()),
  3275. pos_right="3%",
  3276. ),
  3277. ) # 显示
  3278. )
  3279. tab.add(c, "成分热力图")
  3280. c = (
  3281. Bar()
  3282. .add_xaxis([f"第[{i}]主成分" for i in range(len(var))])
  3283. .add_yaxis("方量差", var, **label_setting)
  3284. .set_global_opts(
  3285. title_opts=opts.TitleOpts(title="方量差柱状图"), **global_setting
  3286. )
  3287. )
  3288. des_to_csv(save_dir, "成分重要性", importance, [x_data], [y_data])
  3289. des_to_csv(save_dir, "方量差", [var], [f"第[{i}]主成分" for i in range(len(var))])
  3290. tab.add(c, "方量差柱状图")
  3291. save = save_dir + r"/主成分分析.HTML"
  3292. tab.render(save) # 生成HTML
  3293. return (save,)
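# Illustrative sketch (not part of the pipeline): components_ and
# explained_variance_, the two attributes plotted above. With one linearly
# dependent feature, two components capture almost all of the variance.
def _demo_pca():
    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)
    x = rng.normal(size=(100, 3))
    x[:, 2] = x[:, 0] + x[:, 1]  # the third feature is redundant
    pca = PCA(n_components=2).fit(x)
    return pca.components_, pca.explained_variance_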
  3294. class RpcaModel(Unsupervised):
  3295. def __init__(self, args_use, *args, **kwargs):
  3296. super(RpcaModel, self).__init__(*args, **kwargs)
  3297. self.model = IncrementalPCA(
  3298. n_components=args_use["n_components"], whiten=args_use["white_PCA"]
  3299. )
  3300. self.n_components = args_use["n_components"]
  3301. self.whiten = args_use["white_PCA"]
  3302. self.k = {
  3303. "n_components": args_use["n_components"],
  3304. "whiten": args_use["white_PCA"],
  3305. }
  3306. self.model_Name = "RPCA"
  3307. def predict(self, x_data, *args, **kwargs):
  3308. self.x_testdata = x_data.copy()
  3309. x_predict = self.model.transform(x_data)
  3310. self.y_testdata = x_predict.copy()
  3311. self.have_predict = True
  3312. return x_predict, "RPCA"
  3313. def data_visualization(self, save_dir, *args, **kwargs):
  3314. tab = Tab()
  3315. y_data = self.y_traindata
  3316. importance = self.model.components_.tolist()
  3317. var = self.model.explained_variance_.tolist() # 方量差
  3318. conversion_separate_format(y_data, tab)
        x_data = [f"第{i + 1}主成分" for i in range(len(importance))]  # principal components
        y_data = [f"特征[{i}]" for i in range(len(importance[0]))]  # features
  3321. value = [
  3322. (f"第{i + 1}主成分", f"特征[{j}]", importance[i][j])
  3323. for i in range(len(importance))
  3324. for j in range(len(importance[i]))
  3325. ]
  3326. c = (
  3327. HeatMap()
  3328. .add_xaxis(x_data)
  3329. .add_yaxis(f"", y_data, value, **label_setting) # value的第一个数值是x
  3330. .set_global_opts(
  3331. title_opts=opts.TitleOpts(title="预测热力图"),
  3332. **global_not_legend,
  3333. yaxis_opts=opts.AxisOpts(is_scale=True), # 'category'
  3334. xaxis_opts=opts.AxisOpts(is_scale=True),
  3335. visualmap_opts=opts.VisualMapOpts(
  3336. is_show=True,
  3337. max_=int(self.model.components_.max()) + 1,
  3338. min_=int(self.model.components_.min()),
  3339. pos_right="3%",
  3340. ),
  3341. ) # 显示
  3342. )
  3343. tab.add(c, "成分热力图")
  3344. c = (
  3345. Bar()
  3346. .add_xaxis([f"第[{i}]主成分" for i in range(len(var))])
            .add_yaxis("方量差", var, **label_setting)
  3348. .set_global_opts(
  3349. title_opts=opts.TitleOpts(title="方量差柱状图"), **global_setting
  3350. )
  3351. )
  3352. tab.add(c, "方量差柱状图")
  3353. des_to_csv(save_dir, "成分重要性", importance, [x_data], [y_data])
  3354. des_to_csv(save_dir, "方量差", [var], [f"第[{i}]主成分" for i in range(len(var))])
  3355. save = save_dir + r"/RPCA(主成分分析).HTML"
  3356. tab.render(save) # 生成HTML
  3357. return (save,)
  3358. class KpcaModel(Unsupervised):
  3359. def __init__(self, args_use, *args, **kwargs):
  3360. super(KpcaModel, self).__init__(*args, **kwargs)
  3361. self.model = KernelPCA(
  3362. n_components=args_use["n_components"], kernel=args_use["kernel"]
  3363. )
  3364. self.n_components = args_use["n_components"]
  3365. self.kernel = args_use["kernel"]
  3366. self.k = {
  3367. "n_components": args_use["n_components"],
  3368. "kernel": args_use["kernel"],
  3369. }
  3370. self.model_Name = "KPCA"
  3371. def predict(self, x_data, *args, **kwargs):
  3372. self.x_testdata = x_data.copy()
  3373. x_predict = self.model.transform(x_data)
  3374. self.y_testdata = x_predict.copy()
  3375. self.have_predict = True
  3376. return x_predict, "KPCA"
  3377. def data_visualization(self, save_dir, *args, **kwargs):
  3378. tab = Tab()
  3379. y_data = self.y_testdata
  3380. conversion_separate_format(y_data, tab)
  3381. save = save_dir + r"/KPCA(主成分分析).HTML"
  3382. tab.render(save) # 生成HTML
  3383. return (save,)
  3384. class LdaModel(PrepBase): # 有监督学习
  3385. def __init__(self, args_use, *args, **kwargs):
  3386. super(LdaModel, self).__init__(*args, **kwargs)
  3387. self.model = Lda(n_components=args_use["n_components"])
  3388. self.n_components = args_use["n_components"]
  3389. self.k = {"n_components": args_use["n_components"]}
  3390. self.model_Name = "LDA"
  3391. def predict(self, x_data, *args, **kwargs):
  3392. self.x_testdata = x_data.copy()
  3393. x_predict = self.model.transform(x_data)
  3394. self.y_testdata = x_predict.copy()
  3395. self.have_predict = True
  3396. return x_predict, "LDA"
    def data_visualization(self, save_dir, *args, **kwargs):
        tab = Tab()
        x_data = self.x_testdata
        y_data = self.y_testdata
        conversion_separate_format(y_data, tab)
        w_list = self.model.coef_.tolist()  # turn the coefficients into a table
        b = self.model.intercept_
        x_means = quick_stats(x_data).get()[0]
        # The regression-style y is legacy: classification charts are impossible
        # here because predict returns reduced dimensions, not class labels.
        get = regress_w(x_data, w_list, b, x_means.copy())
        for i in range(len(get)):
            tab.add(get[i], f"类别:{i}LDA映射曲线")
        save = save_dir + r"/线性判别分析.HTML"
        tab.render(save)  # generate the HTML report
        return (save,)
  3413. class NmfModel(Unsupervised):
  3414. def __init__(self, args_use, *args, **kwargs):
  3415. super(NmfModel, self).__init__(*args, **kwargs)
  3416. self.model = NMF(n_components=args_use["n_components"])
  3417. self.n_components = args_use["n_components"]
  3418. self.k = {"n_components": args_use["n_components"]}
  3419. self.model_Name = "NFM"
  3420. self.h_testdata = None
  3421. # x_traindata保存的是W,h_traindata和y_traindata是后来数据
  3422. def predict(self, x_data, x_name="", add_func=None, *args, **kwargs):
  3423. self.x_testdata = x_data.copy()
  3424. x_predict = self.model.transform(x_data)
  3425. self.y_testdata = x_predict.copy()
  3426. self.h_testdata = self.model.components_
  3427. if add_func is not None and x_name != "":
  3428. add_func(self.h_testdata, f"{x_name}:V->NMF[H]")
  3429. self.have_predict = True
  3430. return x_predict, "V->NMF[W]"
  3431. def data_visualization(self, save_dir, *args, **kwargs):
  3432. tab = Tab()
  3433. y_data = self.y_testdata
  3434. x_data = self.x_testdata
  3435. h_data = self.h_testdata
  3436. conversion_separate_wh(y_data, h_data, tab)
  3437. wh_data = np.matmul(y_data, h_data)
  3438. difference_data = x_data - wh_data
  3439. def make_heat_map(data, name, max_, min_):
  3440. x = [f"数据[{i}]" for i in range(len(data))] # 主成分
  3441. y = [f"特征[{i}]" for i in range(len(data[0]))] # 主成分
  3442. value = [
  3443. (f"数据[{i}]", f"特征[{j}]", float(data[i][j]))
  3444. for i in range(len(data))
  3445. for j in range(len(data[i]))
  3446. ]
  3447. c = (
  3448. HeatMap()
  3449. .add_xaxis(x)
  3450. .add_yaxis(f"数据", y, value, **label_setting) # value的第一个数值是x
  3451. .set_global_opts(
  3452. title_opts=opts.TitleOpts(title="原始数据热力图"),
  3453. **global_not_legend,
  3454. yaxis_opts=opts.AxisOpts(
  3455. is_scale=True, type_="category"
  3456. ), # 'category'
  3457. xaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
  3458. visualmap_opts=opts.VisualMapOpts(
  3459. is_show=True, max_=max_, min_=min_, pos_right="3%"
  3460. ),
  3461. ) # 显示
  3462. )
  3463. tab.add(c, name)
  3464. max_ = (
  3465. max(int(x_data.max()), int(wh_data.max()), int(difference_data.max())) + 1
  3466. )
  3467. min_ = min(int(x_data.min()), int(wh_data.min()), int(difference_data.min()))
  3468. make_heat_map(x_data, "原始数据热力图", max_, min_)
  3469. make_heat_map(wh_data, "W * H数据热力图", max_, min_)
  3470. make_heat_map(difference_data, "数据差热力图", max_, min_)
  3471. des_to_csv(save_dir, "权重矩阵", y_data)
  3472. des_to_csv(save_dir, "系数矩阵", h_data)
  3473. des_to_csv(save_dir, "系数*权重矩阵", wh_data)
  3474. save = save_dir + r"/非负矩阵分解.HTML"
  3475. tab.render(save) # 生成HTML
  3476. return (save,)
  3477. class TsneModel(Unsupervised):
  3478. def __init__(self, args_use, *args, **kwargs):
  3479. super(TsneModel, self).__init__(*args, **kwargs)
  3480. self.model = TSNE(n_components=args_use["n_components"])
  3481. self.n_components = args_use["n_components"]
  3482. self.k = {"n_components": args_use["n_components"]}
  3483. self.model_Name = "t-SNE"
  3484. def fit_model(self, *args, **kwargs):
  3485. self.have_fit = True
  3486. return "None", "None"
  3487. def predict(self, x_data, *args, **kwargs):
  3488. self.x_testdata = x_data.copy()
  3489. x_predict = self.model.fit_transform(x_data)
  3490. self.y_testdata = x_predict.copy()
  3491. self.have_predict = True
  3492. return x_predict, "SNE"
  3493. def data_visualization(self, save_dir, *args, **kwargs):
  3494. tab = Tab()
  3495. y_data = self.y_testdata
  3496. conversion_separate_format(y_data, tab)
  3497. save = save_dir + r"/T-SNE.HTML"
  3498. tab.render(save) # 生成HTML
  3499. return (save,)
  3500. class MlpModel(StudyMachinebase): # 神经网络(多层感知机),有监督学习
  3501. def __init__(self, args_use, model, *args, **kwargs):
  3502. super(MlpModel, self).__init__(*args, **kwargs)
        # keep the string name in `model`; put the class into model_class
        model_class = {"MLP": MLPRegressor, "MLP_class": MLPClassifier}[model]
        self.model = model_class(
  3505. hidden_layer_sizes=args_use["hidden_size"],
  3506. activation=args_use["activation"],
  3507. solver=args_use["solver"],
  3508. alpha=args_use["alpha"],
  3509. max_iter=args_use["max_iter"],
  3510. )
  3511. # 记录这两个是为了克隆
  3512. self.hidden_layer_sizes = args_use["hidden_size"]
  3513. self.activation = args_use["activation"]
  3514. self.max_iter = args_use["max_iter"]
  3515. self.solver = args_use["solver"]
  3516. self.alpha = args_use["alpha"]
  3517. self.k = {
  3518. "hidden_layer_sizes": args_use["hidden_size"],
  3519. "activation": args_use["activation"],
  3520. "max_iter": args_use["max_iter"],
  3521. "solver": args_use["solver"],
  3522. "alpha": args_use["alpha"],
  3523. }
  3524. self.model_Name = model
  3525. def data_visualization(self, save_dir, *args, **kwargs):
  3526. tab = Tab()
  3527. x_data = self.x_testdata
  3528. y_data = self.y_testdata
  3529. coefs = self.model.coefs_
        class_ = getattr(self.model, "classes_", [])  # MLPRegressor has no classes_
  3531. n_layers_ = self.model.n_layers_
  3532. def make_heat_map(data, name):
  3533. x = [f"特征(节点)[{i}]" for i in range(len(data))]
  3534. y = [f"节点[{i}]" for i in range(len(data[0]))]
  3535. value = [
  3536. (f"特征(节点)[{i}]", f"节点[{j}]", float(data[i][j]))
  3537. for i in range(len(data))
  3538. for j in range(len(data[i]))
  3539. ]
  3540. c = (
  3541. HeatMap()
  3542. .add_xaxis(x)
  3543. .add_yaxis(f"数据", y, value, **label_setting) # value的第一个数值是x
  3544. .set_global_opts(
  3545. title_opts=opts.TitleOpts(title=name),
  3546. **global_not_legend,
  3547. yaxis_opts=opts.AxisOpts(
  3548. is_scale=True, type_="category"
  3549. ), # 'category'
  3550. xaxis_opts=opts.AxisOpts(is_scale=True, type_="category"),
  3551. visualmap_opts=opts.VisualMapOpts(
  3552. is_show=True,
  3553. max_=float(data.max()),
  3554. min_=float(data.min()),
  3555. pos_right="3%",
  3556. ),
  3557. ) # 显示
  3558. )
  3559. tab.add(c, name)
            tab.add(make_tab(x, data.transpose().tolist()), f"{name}:表格")
            des_to_csv(save_dir, f"{name}:表格", data.transpose().tolist(), x, y)
  3562. get, x_means, x_range, data_type = regress_visualization(x_data, y_data)
  3563. for i in range(len(get)):
  3564. tab.add(get[i], f"{i}训练数据散点图")
  3565. get = prediction_boundary(x_range, x_means, self.predict, data_type)
  3566. for i in range(len(get)):
  3567. tab.add(get[i], f"{i}预测热力图")
  3568. heard = ["神经网络层数"]
  3569. data = [n_layers_]
  3570. for i in range(len(coefs)):
  3571. make_heat_map(coefs[i], f"{i}层权重矩阵")
  3572. heard.append(f"第{i}层节点数")
  3573. data.append(len(coefs[i][0]))
  3574. if self.model_Name == "MLP_class":
  3575. heard += [f"[{i}]类型" for i in range(len(class_))]
  3576. data += class_.tolist()
  3577. tab.add(make_tab(heard, [data]), "数据表")
  3578. save = save_dir + r"/多层感知机.HTML"
  3579. tab.render(save) # 生成HTML
  3580. return (save,)
  3581. class KmeansModel(UnsupervisedModel):
  3582. def __init__(self, args_use, *args, **kwargs):
  3583. super(KmeansModel, self).__init__(*args, **kwargs)
  3584. self.model = KMeans(n_clusters=args_use["n_clusters"])
  3585. self.class_ = []
  3586. self.n_clusters = args_use["n_clusters"]
  3587. self.k = {"n_clusters": args_use["n_clusters"]}
  3588. self.model_Name = "k-means"
  3589. def fit_model(self, x_data, *args, **kwargs):
  3590. re = super().fit_model(x_data, *args, **kwargs)
  3591. self.class_ = list(set(self.model.labels_.tolist()))
  3592. self.have_fit = True
  3593. return re
  3594. def predict(self, x_data, *args, **kwargs):
  3595. self.x_testdata = x_data.copy()
  3596. y_predict = self.model.predict(x_data)
  3597. self.y_testdata = y_predict.copy()
  3598. self.have_predict = True
  3599. return y_predict, "k-means"
  3600. def data_visualization(self, save_dir, *args, **kwargs):
  3601. tab = Tab()
  3602. y = self.y_testdata
  3603. x_data = self.x_testdata
  3604. class_ = self.class_
  3605. center = self.model.cluster_centers_
  3606. class_heard = [f"簇[{i}]" for i in range(len(class_))]
  3607. func = (
  3608. training_visualization_more
  3609. if more_global
  3610. else training_visualization_center
  3611. )
  3612. get, x_means, x_range, data_type = func(x_data, class_, y, center)
  3613. for i in range(len(get)):
  3614. tab.add(get[i], f"{i}数据散点图")
  3615. get = decision_boundary(x_range, x_means, self.predict, class_, data_type)
  3616. for i in range(len(get)):
  3617. tab.add(get[i], f"{i}预测热力图")
  3618. heard = class_heard + [f"普适预测第{i}特征" for i in range(len(x_means))]
  3619. data = class_ + [f"{i}" for i in x_means]
  3620. c = Table().add(headers=heard, rows=[data])
  3621. tab.add(c, "数据表")
  3622. des_to_csv(
  3623. save_dir,
  3624. "预测表",
  3625. [[f"{i}" for i in x_means]],
  3626. [f"普适预测第{i}特征" for i in range(len(x_means))],
  3627. )
  3628. save = save_dir + r"/k-means聚类.HTML"
  3629. tab.render(save) # 生成HTML
  3630. return (save,)
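# Illustrative sketch (not part of the pipeline): labels_ after fit and
# cluster_centers_, the quantities visualized above, on two obvious blobs.
def _demo_kmeans():
    import numpy as np
    from sklearn.cluster import KMeans

    x = np.vstack([np.zeros((10, 2)), np.ones((10, 2)) * 5.0])
    model = KMeans(n_clusters=2, n_init=10).fit(x)
    return model.cluster_centers_, sorted(set(model.labels_.tolist()))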
  3631. class AgglomerativeModel(UnsupervisedModel):
  3632. def __init__(self, args_use, *args, **kwargs):
  3633. super(AgglomerativeModel, self).__init__(*args, **kwargs)
  3634. self.model = AgglomerativeClustering(
  3635. n_clusters=args_use["n_clusters"]
  3636. ) # 默认为2,不同于k-means
  3637. self.class_ = []
  3638. self.n_clusters = args_use["n_clusters"]
  3639. self.k = {"n_clusters": args_use["n_clusters"]}
  3640. self.model_Name = "Agglomerative"
  3641. def fit_model(self, x_data, *args, **kwargs):
  3642. re = super().fit_model(x_data, *args, **kwargs)
  3643. self.class_ = list(set(self.model.labels_.tolist()))
  3644. self.have_fit = True
  3645. return re
  3646. def predict(self, x_data, *args, **kwargs):
  3647. self.x_testdata = x_data.copy()
  3648. y_predict = self.model.fit_predict(x_data)
        self.y_testdata = y_predict.copy()  # data_visualization reads y_testdata
  3650. self.have_predict = True
  3651. return y_predict, "Agglomerative"
  3652. def data_visualization(self, save_dir, *args, **kwargs):
  3653. tab = Tab()
  3654. y = self.y_testdata
  3655. x_data = self.x_testdata
  3656. class_ = self.class_
  3657. class_heard = [f"簇[{i}]" for i in range(len(class_))]
  3658. func = (
  3659. training_visualization_more_no_center
  3660. if more_global
  3661. else training_visualization
  3662. )
  3663. get, x_means, x_range, data_type = func(x_data, class_, y)
  3664. for i in range(len(get)):
  3665. tab.add(get[i], f"{i}训练数据散点图")
  3666. get = decision_boundary(x_range, x_means, self.predict, class_, data_type)
  3667. for i in range(len(get)):
  3668. tab.add(get[i], f"{i}预测热力图")
  3669. linkage_array = ward(self.x_traindata) # self.y_traindata是结果
  3670. dendrogram(linkage_array)
  3671. plt.savefig(save_dir + r"/Cluster_graph.png")
  3672. image = Image()
  3673. image.add(src=save_dir + r"/Cluster_graph.png",).set_global_opts(
  3674. title_opts=opts.ComponentTitleOpts(title="聚类树状图")
  3675. )
  3676. tab.add(image, "聚类树状图")
  3677. heard = class_heard + [f"普适预测第{i}特征" for i in range(len(x_means))]
  3678. data = class_ + [f"{i}" for i in x_means]
  3679. c = Table().add(headers=heard, rows=[data])
  3680. tab.add(c, "数据表")
  3681. des_to_csv(
  3682. save_dir,
  3683. "预测表",
  3684. [[f"{i}" for i in x_means]],
  3685. [f"普适预测第{i}特征" for i in range(len(x_means))],
  3686. )
  3687. save = save_dir + r"/层次聚类.HTML"
  3688. tab.render(save) # 生成HTML
  3689. return (save,)
  3690. class DbscanModel(UnsupervisedModel):
  3691. def __init__(self, args_use, *args, **kwargs):
  3692. super(DbscanModel, self).__init__(*args, **kwargs)
  3693. self.model = DBSCAN(eps=args_use["eps"], min_samples=args_use["min_samples"])
  3694. # eps是距离(0.5),min_samples(5)是簇与噪音分界线(每个簇最小元素数)
  3695. # min_samples
  3696. self.eps = args_use["eps"]
  3697. self.min_samples = args_use["min_samples"]
  3698. self.k = {"min_samples": args_use["min_samples"], "eps": args_use["eps"]}
  3699. self.class_ = []
  3700. self.model_Name = "DBSCAN"
  3701. def fit_model(self, x_data, *args, **kwargs):
  3702. re = super().fit_model(x_data, *args, **kwargs)
  3703. self.class_ = list(set(self.model.labels_.tolist()))
  3704. self.have_fit = True
  3705. return re
  3706. def predict(self, x_data, *args, **kwargs):
  3707. self.x_testdata = x_data.copy()
  3708. y_predict = self.model.fit_predict(x_data)
  3709. self.y_testdata = y_predict.copy()
  3710. self.have_predict = True
  3711. return y_predict, "DBSCAN"
  3712. def data_visualization(self, save_dir, *args, **kwargs):
  3713. # DBSCAN没有预测的必要
  3714. tab = Tab()
  3715. y = self.y_testdata.copy()
  3716. x_data = self.x_testdata.copy()
  3717. class_ = self.class_
  3718. class_heard = [f"簇[{i}]" for i in range(len(class_))]
  3719. func = (
  3720. training_visualization_more_no_center
  3721. if more_global
  3722. else training_visualization
  3723. )
  3724. get, x_means, x_range, data_type = func(x_data, class_, y)
  3725. for i in range(len(get)):
  3726. tab.add(get[i], f"{i}训练数据散点图")
  3727. heard = class_heard + [f"普适预测第{i}特征" for i in range(len(x_means))]
  3728. data = class_ + [f"{i}" for i in x_means]
  3729. c = Table().add(headers=heard, rows=[data])
  3730. tab.add(c, "数据表")
  3731. des_to_csv(
  3732. save_dir,
  3733. "预测表",
  3734. [[f"{i}" for i in x_means]],
  3735. [f"普适预测第{i}特征" for i in range(len(x_means))],
  3736. )
  3737. save = save_dir + r"/密度聚类.HTML"
  3738. tab.render(save) # 生成HTML
  3739. return (save,)
  3740. class FastFourier(StudyMachinebase): # 快速傅里叶变换
  3741. def __init__(self, *args, **kwargs):
  3742. super(FastFourier, self).__init__(*args, **kwargs)
  3743. self.model = None
  3744. self.fourier = None # fft复数
  3745. self.frequency = None # 频率range
  3746. self.angular_Frequency = None # 角频率range
  3747. self.phase = None # 相位range
  3748. self.breadth = None # 震幅range
  3749. self.sample_size = None # 样本数
    def fit_model(self, y_data, *args, **kwargs):
        y_data = y_data.ravel()  # flatten to a 1-D array
        try:
            if self.y_traindata is None:
                raise Exception
            # np.hstack takes a tuple; append the new samples to the history
            self.y_traindata = np.hstack((y_data, self.y_traindata))
        except BaseException:
            self.y_traindata = y_data.copy()
  3758. fourier = fft(y_data)
  3759. self.sample_size = len(y_data)
  3760. self.frequency = np.linspace(0, 1, self.sample_size) # 频率N_range
        self.angular_Frequency = self.frequency * (np.pi * 2)  # angular frequency ω = 2πf
  3762. self.phase = np.angle(fourier)
  3763. self.breadth = np.abs(fourier)
  3764. self.fourier = fourier
  3765. self.have_fit = True
  3766. return "None", "None"
  3767. def predict(self, x_data, *args, **kwargs):
  3768. return np.array([]), ""
  3769. def data_visualization(self, save_dir, *args, **kwargs):
  3771. tab = Tab()
  3772. y = self.y_traindata.copy()
  3773. n = self.sample_size
  3774. phase = self.phase # 相位range
  3775. breadth = self.breadth # 震幅range
  3776. normalization_breadth = breadth / n
  3777. def line(name, value, s=slice(0, None)) -> Line:
  3778. c = (
  3779. Line()
  3780. .add_xaxis(self.frequency[s].tolist())
  3781. .add_yaxis(
  3782. "",
  3783. value,
  3784. **label_setting,
  3785. symbol="none" if self.sample_size >= 500 else None,
  3786. )
  3787. .set_global_opts(
  3788. title_opts=opts.TitleOpts(title=name),
  3789. **global_not_legend,
  3790. xaxis_opts=opts.AxisOpts(type_="value"),
  3791. yaxis_opts=opts.AxisOpts(type_="value"),
  3792. )
  3793. )
  3794. return c
  3795. tab.add(line("原始数据", y.tolist()), "原始数据")
  3796. tab.add(line("双边振幅谱", breadth.tolist()), "双边振幅谱")
  3797. tab.add(line("双边振幅谱(归一化)", normalization_breadth.tolist()), "双边振幅谱(归一化)")
        tab.add(
            line("单边振幅谱", breadth[: int(n / 2)].tolist(), slice(0, int(n / 2))),
            "单边振幅谱",
        )
        tab.add(
            line(
                "单边振幅谱(归一化)",
                normalization_breadth[: int(n / 2)].tolist(),
                slice(0, int(n / 2)),
            ),
            "单边振幅谱(归一化)",
        )
  3809. tab.add(line("双边相位谱", phase.tolist()), "双边相位谱")
  3810. tab.add(
  3811. line("单边相位谱", phase[: int(n / 2)].tolist(), slice(0, int(n / 2))), "单边相位谱"
  3812. )
  3813. tab.add(make_tab(self.frequency.tolist(), [breadth.tolist()]), "双边振幅谱")
  3814. tab.add(make_tab(self.frequency.tolist(), [phase.tolist()]), "双边相位谱")
  3815. tab.add(make_tab(self.frequency.tolist(), [self.fourier.tolist()]), "快速傅里叶变换")
  3816. save = save_dir + r"/快速傅里叶.HTML"
  3817. tab.render(save) # 生成HTML
  3818. return (save,)
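# Illustrative sketch (not part of the pipeline): recovering the amplitude of a
# known sine from the FFT; the two-sided spectrum is divided by n and the
# one-sided spectrum additionally doubled, matching the normalization above.
def _demo_fft():
    import numpy as np

    n = 256
    t = np.arange(n) / n
    y = 3.0 * np.sin(2 * np.pi * 10 * t)  # amplitude 3, 10 cycles per window
    breadth = np.abs(np.fft.fft(y))
    single = 2 * breadth[: n // 2] / n  # one-sided, normalized amplitude
    return int(np.argmax(single)), float(single.max())  # -> (10, ~3.0)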
  3819. class ReverseFastFourier(StudyMachinebase): # 快速傅里叶变换
  3820. def __init__(self, *args, **kwargs):
  3821. super(ReverseFastFourier, self).__init__(*args, **kwargs)
  3822. self.model = None
  3823. self.sample_size = None
  3824. self.y_testdata_real = None
  3825. self.phase = None
  3826. self.breadth = None
  3827. def fit_model(self, y_data, *args, **kwargs):
  3828. return "None", "None"
  3829. def predict(self, x_data, x_name="", add_func=None, *args, **kwargs):
        self.x_testdata = x_data.ravel().astype(np.complex128)  # np.complex_ was removed in NumPy 2.0
  3831. fourier = ifft(self.x_testdata)
  3832. self.y_testdata = fourier.copy()
  3833. self.y_testdata_real = np.real(fourier)
  3834. self.sample_size = len(self.y_testdata_real)
  3835. self.phase = np.angle(self.x_testdata)
  3836. self.breadth = np.abs(self.x_testdata)
        if add_func is not None:  # add_func defaults to None
            add_func(self.y_testdata_real.copy(), f"{x_name}:逆向快速傅里叶变换[实数]")
  3838. return fourier, "逆向快速傅里叶变换"
  3839. def data_visualization(self, save_dir, *args, **kwargs):
  3841. tab = Tab()
  3842. y = self.y_testdata_real.copy()
  3843. y_data = self.y_testdata.copy()
  3844. n = self.sample_size
  3845. range_n = np.linspace(0, 1, n).tolist()
  3846. phase = self.phase # 相位range
  3847. breadth = self.breadth # 震幅range
  3848. def line(name, value, s=slice(0, None)) -> Line:
  3849. c = (
  3850. Line()
  3851. .add_xaxis(range_n[s])
  3852. .add_yaxis(
  3853. "", value, **label_setting, symbol="none" if n >= 500 else None
  3854. )
  3855. .set_global_opts(
  3856. title_opts=opts.TitleOpts(title=name),
  3857. **global_not_legend,
  3858. xaxis_opts=opts.AxisOpts(type_="value"),
  3859. yaxis_opts=opts.AxisOpts(type_="value"),
  3860. )
  3861. )
  3862. return c
  3863. tab.add(line("逆向傅里叶变换", y.tolist()), "逆向傅里叶变换[实数]")
  3864. tab.add(make_tab(range_n, [y_data.tolist()]), "逆向傅里叶变换数据")
  3865. tab.add(make_tab(range_n, [y.tolist()]), "逆向傅里叶变换数据[实数]")
  3866. tab.add(line("双边振幅谱", breadth.tolist()), "双边振幅谱")
        tab.add(
            line("单边振幅谱", breadth[: int(n / 2)].tolist(), slice(0, int(n / 2))),
            "单边振幅谱",
        )
  3870. tab.add(line("双边相位谱", phase.tolist()), "双边相位谱")
  3871. tab.add(
  3872. line("单边相位谱", phase[: int(n / 2)].tolist(), slice(0, int(n / 2))), "单边相位谱"
  3873. )
        save = save_dir + r"/逆向快速傅里叶.HTML"  # avoid overwriting the forward FFT report
  3875. tab.render(save) # 生成HTML
  3876. return (save,)
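# Illustrative sketch (not part of the pipeline): ifft(fft(y)) restores the
# original sequence, so the real part plotted above is the reconstruction.
def _demo_ifft_roundtrip():
    import numpy as np

    y = np.array([1.0, 2.0, 3.0, 4.0])
    restored = np.fft.ifft(np.fft.fft(y))
    assert np.allclose(np.real(restored), y)
    return restored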
  3877. class ReverseFastFourierTwonumpy(ReverseFastFourier): # 2快速傅里叶变换
  3878. def fit_model(self, x_data, y_data=None, x_name="", add_func=None, *args, **kwargs):
  3879. r = np.multiply(np.cos(x_data), y_data)
  3880. j = np.multiply(np.sin(x_data), y_data) * 1j
  3881. super(ReverseFastFourierTwonumpy, self).predict(
  3882. r + j, x_name=x_name, add_func=add_func, *args, **kwargs
  3883. )
  3884. return "None", "None"
  3885. class CurveFitting(StudyMachinebase): # 曲线拟合
  3886. def __init__(self, name, str_, model, *args, **kwargs):
  3887. super(CurveFitting, self).__init__(*args, **kwargs)
  3888. def ndim_down(data: np.ndarray):
  3889. if data.ndim == 1:
  3890. return data
  3891. new_data = []
  3892. for i in data:
  3893. new_data.append(np.sum(i))
  3894. return np.array(new_data)
  3895. named_domain = {"np": np, "Func": model, "ndimDown": ndim_down}
  3896. protection_func = f"""
  3897. def FUNC({",".join(model.__code__.co_varnames)}):
  3898. answer = Func({",".join(model.__code__.co_varnames)})
  3899. return ndimDown(answer)
  3900. """
  3901. exec(protection_func, named_domain)
  3902. self.func = named_domain["FUNC"]
  3903. self.fit_data = None
  3904. self.name = name
  3905. self.func_str = str_
  3906. def fit_model(self, x_data: np.ndarray, y_data: np.ndarray, *args, **kwargs):
  3907. y_data = y_data.ravel()
  3908. x_data = x_data.astype(np.float64)
        try:
            if self.x_traindata is None:
                raise Exception
            # np.vstack takes a tuple of arrays
            self.x_traindata = np.vstack((x_data, self.x_traindata))
            self.y_traindata = np.vstack((y_data, self.y_traindata))
  3914. except BaseException:
  3915. self.x_traindata = x_data.copy()
  3916. self.y_traindata = y_data.copy()
  3917. self.fit_data = optimize.curve_fit(
  3918. self.func, self.x_traindata, self.y_traindata
  3919. )
  3920. self.model = self.fit_data[0].copy()
  3921. return "None", "None"
  3922. def predict(self, x_data, *args, **kwargs):
  3923. self.x_testdata = x_data.copy()
  3924. predict = self.func(x_data, *self.model)
  3925. y_predict = []
  3926. for i in predict:
  3927. y_predict.append(np.sum(i))
  3928. y_predict = np.array(y_predict)
  3929. self.y_testdata = y_predict.copy()
  3930. self.have_predict = True
  3931. return y_predict, self.name
  3932. def data_visualization(self, save_dir, *args, **kwargs):
  3934. tab = Tab()
  3935. y = self.y_testdata.copy()
  3936. x_data = self.x_testdata.copy()
  3937. get, x_means, x_range, data_type = regress_visualization(x_data, y)
  3938. for i in range(len(get)):
  3939. tab.add(get[i], f"{i}预测类型图")
  3940. get = prediction_boundary(x_range, x_means, self.predict, data_type)
  3941. for i in range(len(get)):
  3942. tab.add(get[i], f"{i}预测热力图")
  3943. tab.add(
  3944. make_tab(
  3945. [f"普适预测第{i}特征" for i in range(len(x_means))],
  3946. [[f"{i}" for i in x_means]],
  3947. ),
  3948. "普适预测特征数据",
  3949. )
  3950. tab.add(
  3951. make_tab(
  3952. [f"参数[{i}]" for i in range(len(self.model))],
  3953. [[f"{i}" for i in self.model]],
  3954. ),
  3955. "拟合参数",
  3956. )
  3957. save = save_dir + r"/曲线拟合.HTML"
  3958. tab.render(save) # 生成HTML
  3959. return (save,)
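# Illustrative sketch (not part of the pipeline): optimize.curve_fit returns
# (parameters, covariance); fit_data[0] above holds the fitted parameters.
def _demo_curve_fit():
    import numpy as np
    from scipy import optimize

    def func(x, a, b):
        return a * x + b

    x = np.linspace(0.0, 1.0, 20)
    y = 2.0 * x + 1.0
    params, _covariance = optimize.curve_fit(func, x, y)
    return params  # -> approximately [2.0, 1.0]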
  3960. class MachineLearner(Learner): # 数据处理者
  3961. def __init__(self, *args, **kwargs):
  3962. super().__init__(*args, **kwargs)
  3963. self.learner = {} # 记录机器
  3964. self.learn_dict = {
  3965. "Line": LineModel,
  3966. "Ridge": LineModel,
  3967. "Lasso": LineModel,
  3968. "LogisticRegression": LogisticregressionModel,
  3969. "Knn_class": KnnModel,
  3970. "Knn": KnnModel,
  3971. "Tree_class": TreeModel,
  3972. "Tree": TreeModel,
  3973. "Forest": ForestModel,
  3974. "Forest_class": ForestModel,
  3975. "GradientTree_class": GradienttreeModel,
  3976. "GradientTree": GradienttreeModel,
  3977. "Variance": VarianceModel,
  3978. "SelectKBest": SelectkbestModel,
  3979. "Z-Score": StandardizationModel,
  3980. "MinMaxScaler": MinmaxscalerModel,
  3981. "LogScaler": LogscalerModel,
  3982. "atanScaler": AtanscalerModel,
  3983. "decimalScaler": DecimalscalerModel,
  3984. "sigmodScaler": SigmodscalerModel,
  3985. "Mapzoom": MapzoomModel,
  3986. "Fuzzy_quantization": FuzzyQuantizationModel,
  3987. "Regularization": RegularizationModel,
  3988. "Binarizer": BinarizerModel,
  3989. "Discretization": DiscretizationModel,
  3990. "Label": LabelModel,
  3991. "OneHotEncoder": OneHotEncoderModel,
  3992. "Missed": MissedModel,
  3993. "PCA": PcaModel,
  3994. "RPCA": RpcaModel,
  3995. "KPCA": KpcaModel,
  3996. "LDA": LdaModel,
  3997. "SVC": SvcModel,
  3998. "SVR": SvrModel,
  3999. "MLP": MlpModel,
  4000. "MLP_class": MlpModel,
  4001. "NMF": NmfModel,
  4002. "t-SNE": TsneModel,
  4003. "k-means": KmeansModel,
  4004. "Agglomerative": AgglomerativeModel,
  4005. "DBSCAN": DbscanModel,
  4006. "ClassBar": ClassBar,
  4007. "FeatureScatter": NearFeatureScatter,
  4008. "FeatureScatterClass": NearFeatureScatterClass,
  4009. "FeatureScatter_all": NearFeatureScatterMore,
  4010. "FeatureScatterClass_all": NearFeatureScatterClassMore,
  4011. "HeatMap": NumpyHeatMap,
  4012. "FeatureY-X": FeatureScatterYX,
  4013. "ClusterTree": ClusterTree,
  4014. "MatrixScatter": MatrixScatter,
  4015. "Correlation": Corr,
  4016. "Statistics": DataAnalysis,
  4017. "Fast_Fourier": FastFourier,
  4018. "Reverse_Fast_Fourier": ReverseFastFourier,
  4019. "[2]Reverse_Fast_Fourier": ReverseFastFourierTwonumpy,
  4020. }
  4021. self.data_type = {} # 记录机器的类型
  4022. def learner_parameters(self, parameters, data_type): # 解析参数
  4023. original_parameter = {}
  4024. target_parameter = {}
  4025. # 输入数据
  4026. exec(parameters, original_parameter)
  4027. # 处理数据
  4028. if data_type in ("MLP", "MLP_class"):
  4029. target_parameter["alpha"] = float(
  4030. original_parameter.get("alpha", 0.0001)
  4031. ) # MLP正则化用
  4032. else:
  4033. target_parameter["alpha"] = float(
  4034. original_parameter.get("alpha", 1.0)
  4035. ) # L1和L2正则化用
  4036. target_parameter["C"] = float(original_parameter.get("C", 1.0)) # L1和L2正则化用
  4037. if data_type in ("MLP", "MLP_class"):
            target_parameter["max_iter"] = int(
                original_parameter.get("max_iter", 200)
            )  # iteration cap for the MLP solvers
  4041. else:
            target_parameter["max_iter"] = int(
                original_parameter.get("max_iter", 1000)
            )  # iteration cap for the linear models
  4045. target_parameter["n_neighbors"] = int(
  4046. original_parameter.get("K_knn", 5)
  4047. ) # knn邻居数 (命名不同)
  4048. target_parameter["p"] = int(original_parameter.get("p", 2)) # 距离计算方式
  4049. target_parameter["nDim_2"] = bool(
  4050. original_parameter.get("nDim_2", True)
  4051. ) # 数据是否降维
  4052. if data_type in ("Tree", "Forest", "GradientTree"):
  4053. target_parameter["criterion"] = (
  4054. "mse" if bool(original_parameter.get("is_MSE", True)) else "mae"
  4055. ) # 是否使用基尼不纯度
  4056. else:
  4057. target_parameter["criterion"] = (
  4058. "gini" if bool(original_parameter.get("is_Gini", True)) else "entropy"
  4059. ) # 是否使用基尼不纯度
  4060. target_parameter["splitter"] = (
  4061. "random" if bool(original_parameter.get("is_random", False)) else "best"
  4062. ) # 决策树节点是否随机选用最优
  4063. target_parameter["max_features"] = original_parameter.get(
  4064. "max_features", None
  4065. ) # 选用最多特征数
  4066. target_parameter["max_depth"] = original_parameter.get(
  4067. "max_depth", None
  4068. ) # 最大深度
  4069. target_parameter["min_samples_split"] = int(
  4070. original_parameter.get("min_samples_split", 2)
  4071. ) # 是否继续划分(容易造成过拟合)
  4072. target_parameter["P"] = float(original_parameter.get("min_samples_split", 0.8))
  4073. target_parameter["k"] = original_parameter.get("k", 1)
  4074. target_parameter["score_func"] = {
  4075. "chi2": chi2,
  4076. "f_classif": f_classif,
  4077. "mutual_info_classif": mutual_info_classif,
  4078. "f_regression": f_regression,
  4079. "mutual_info_regression": mutual_info_regression,
  4080. }.get(original_parameter.get("score_func", "f_classif"), f_classif)
  4081. target_parameter["feature_range"] = tuple(
  4082. original_parameter.get("feature_range", (0, 1))
  4083. )
  4084. target_parameter["norm"] = original_parameter.get("norm", "l2") # 正则化的方式L1或者L2
  4085. target_parameter["threshold"] = float(
  4086. original_parameter.get("threshold", 0.0)
  4087. ) # 二值化特征
  4088. target_parameter["split_range"] = list(
  4089. original_parameter.get("split_range", [0])
  4090. ) # 二值化特征
  4091. target_parameter["ndim_up"] = bool(original_parameter.get("ndim_up", False))
  4092. target_parameter["miss_value"] = original_parameter.get("miss_value", np.nan)
  4093. target_parameter["fill_method"] = original_parameter.get("fill_method", "mean")
  4094. target_parameter["fill_value"] = original_parameter.get("fill_value", None)
  4095. target_parameter["n_components"] = original_parameter.get("n_components", 1)
  4096. target_parameter["kernel"] = original_parameter.get(
  4097. "kernel", "rbf" if data_type in ("SVR", "SVC") else "linear"
  4098. )
  4099. target_parameter["n_Tree"] = original_parameter.get("n_Tree", 100)
  4100. target_parameter["gamma"] = original_parameter.get("gamma", 1)
  4101. target_parameter["hidden_size"] = tuple(
  4102. original_parameter.get("hidden_size", (100,))
  4103. )
  4104. target_parameter["activation"] = str(
  4105. original_parameter.get("activation", "relu")
  4106. )
  4107. target_parameter["solver"] = str(original_parameter.get("solver", "adam"))
  4108. if data_type in ("k-means",):
  4109. target_parameter["n_clusters"] = int(
  4110. original_parameter.get("n_clusters", 8)
  4111. )
  4112. else:
  4113. target_parameter["n_clusters"] = int(
  4114. original_parameter.get("n_clusters", 2)
  4115. )
  4116. target_parameter["eps"] = float(original_parameter.get("n_clusters", 0.5))
  4117. target_parameter["min_samples"] = int(original_parameter.get("n_clusters", 5))
  4118. target_parameter["white_PCA"] = bool(original_parameter.get("white_PCA", False))
  4119. return target_parameter
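    # The parameter string is plain Python executed via exec(); a hedged
    # example with hypothetical values:
    #     learner_parameters("K_knn = 3\np = 1", "Knn")
    # yields target_parameter["n_neighbors"] == 3 and
    # target_parameter["p"] == 1, with every key that is not assigned
    # falling back to the defaults above.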
    def add_learner(self, learner_str, parameters=""):
        get = self.learn_dict[learner_str]
        name = f"Le[{len(self.learner)}]{learner_str}"
        # adjust the parameters
        args_use = self.learner_parameters(parameters, learner_str)
        # create the learner
        self.learner[name] = get(model=learner_str, args_use=args_use)
        self.data_type[name] = learner_str
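    # Hedged usage sketch (hypothetical values): add_learner("Knn", "K_knn = 3")
    # registers a KnnModel under the auto-generated key "Le[0]Knn" when the
    # registry is empty; the bracketed index is len(self.learner) at call time.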
    def add_curve_fitting(self, learner):
        named_domain = {}
        exec(learner, named_domain)
        name = f'Le[{len(self.learner)}]{named_domain.get("name", "SELF")}'
        func = named_domain.get("f", lambda x, k, b: k * x + b)
        self.learner[name] = CurveFitting(name, learner, func)
        self.data_type[name] = "Curve_fitting"
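    # The learner string is exec()-ed and should bind "name" and "f"; a
    # minimal sketch with a hypothetical function:
    #     add_curve_fitting('name = "square"\nf = lambda x, a, b: a * x ** 2 + b')
    # Without an "f" binding the default straight line k * x + b is fitted,
    # and the name falls back to "SELF".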
    def add_select_from_model(self, learner, parameters=""):
        model = self.get_learner(learner)
        name = f"Le[{len(self.learner)}]SelectFrom_Model:{learner}"
        # adjust the parameters
        args_use = self.learner_parameters(parameters, "SelectFrom_Model")
        # create the learner
        self.learner[name] = SelectFromModel(
            learner=model, args_use=args_use, Dic=self.learn_dict
        )
        self.data_type[name] = "SelectFrom_Model"
    def add_predictive_heat_map(self, learner, parameters=""):
        model = self.get_learner(learner)
        name = f"Le[{len(self.learner)}]Predictive_HeatMap:{learner}"
        # create the learner
        args_use = self.learner_parameters(parameters, "Predictive_HeatMap")
        self.learner[name] = PredictiveHeatmap(learner=model, args_use=args_use)
        self.data_type[name] = "Predictive_HeatMap"
    def add_predictive_heat_map_more(self, learner, parameters=""):
        model = self.get_learner(learner)
        name = f"Le[{len(self.learner)}]Predictive_HeatMap_More:{learner}"
        # create the learner
        args_use = self.learner_parameters(parameters, "Predictive_HeatMap_More")
        self.learner[name] = PredictiveHeatmapMore(learner=model, args_use=args_use)
        self.data_type[name] = "Predictive_HeatMap_More"
    def add_view_data(self, learner, parameters=""):
        model = self.get_learner(learner)
        name = f"Le[{len(self.learner)}]View_data:{learner}"
        # create the learner
        args_use = self.learner_parameters(parameters, "View_data")
        self.learner[name] = ViewData(learner=model, args_use=args_use)
        self.data_type[name] = "View_data"
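    # The four helpers above share one pattern: look up an existing learner
    # by name, wrap it in a meta-model (feature selection, prediction heat
    # maps, or raw data inspection), and register the wrapper as a new
    # learner entry with its own type tag.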
    def return_learner(self):
        return self.learner.copy()
    def get_learner(self, name):
        return self.learner[name]
    def get_learner_type(self, name):
        return self.data_type[name]
    def fit_model(self, x_name, y_name, learner, split=0.3, *args, **kwargs):
        x_data = self.get_sheet(x_name)
        y_data = self.get_sheet(y_name)
        model = self.get_learner(learner)
        return model.fit_model(
            x_data, y_data, split=split, x_name=x_name, add_func=self.add_form
        )
    def predict(self, x_name, learner, **kwargs):
        x_data = self.get_sheet(x_name)
        model = self.get_learner(learner)
        y_data, name = model.predict(x_data, x_name=x_name, add_func=self.add_form)
        self.add_form(y_data, f"{x_name}:{name}")
        return y_data
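    # Note: the prediction is also registered as a new sheet via add_form,
    # so the result "{x_name}:{name}" can be referenced by name in later
    # fit/score calls (assuming get_sheet reads the same registry).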
    def score(self, name_x, name_y, learner):  # Score_Only means scoring only; Fit_Simp is the generic operation
        model = self.get_learner(learner)
        x = self.get_sheet(name_x)
        y = self.get_sheet(name_y)
        return model.score(x, y)
    def model_evaluation(self, learner, save_dir, name_x, name_y, func=0):  # show parameters
        x = self.get_sheet(name_x)
        y = self.get_sheet(name_y)
        if new_dir_global:
            dic = save_dir + f"/{learner}分类评分[CoTan]"
            new_dic = dic
            a = 0
            while exists(new_dic):  # loop until the directory does not exist
                new_dic = dic + f"[{a}]"
                a += 1
            mkdir(new_dic)
        else:
            new_dic = save_dir
        model = self.get_learner(learner)
        # pick the scoring routine: 0=classification, 1=regression, 2=clustering
        func = [model.class_score, model.regression_score, model.clusters_score][func]
        save = func(new_dic, x, y)[0]
        if tar_global:
            pack(f"{new_dic}.tar.gz", new_dic)
        return save, new_dic
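    # Hedged usage sketch (sheet names hypothetical): assuming sheets "x" and
    # "y" were added earlier, model_evaluation("Le[0]Knn", save_dir, "x", "y",
    # func=0) writes the classification charts into a freshly numbered
    # sub-directory and, if tar_global is set, packs it into a .tar.gz.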
    def model_visualization(self, learner, save_dir):  # show parameters
        if new_dir_global:
            dic = save_dir + f"/{learner}数据[CoTan]"
            new_dic = dic
            a = 0
            while exists(new_dic):  # loop until the directory does not exist
                new_dic = dic + f"[{a}]"
                a += 1
            mkdir(new_dic)
        else:
            new_dic = save_dir
        model = self.get_learner(learner)
        if model.Model is not None and not isinstance(model.Model, list) and clf_global:
            joblib.dump(model.Model, new_dic + "/MODEL.model")  # save the model
        # export the visualization, then pack the directory below
        save = model.data_visualization(new_dic)[0]
        if tar_global:
            pack(f"{new_dic}.tar.gz", new_dic)
        return save, new_dic
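    # The dumped file can be restored later with the standard sklearn
    # persistence round-trip, e.g. joblib.load(new_dic + "/MODEL.model").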
    def del_leaner(self, learner):
        del self.learner[learner]
        del self.data_type[learner]
def pack(output_filename, source_dir):
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=basename(source_dir))
    return output_filename
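# Hypothetical example: pack("/tmp/results.tar.gz", "/tmp/results") produces a
# gzip-compressed tarball whose single top-level entry is the basename of the
# source directory.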
def set_global(
    more=more_global,
    all=all_global,
    csv=csv_global,
    clf=clf_global,
    tar=tar_global,
    new=new_dir_global,
):
    global more_global, all_global, csv_global, clf_global, tar_global, new_dir_global
    more_global = more  # whether to plot with all features
    all_global = all  # whether to export charts
    csv_global = csv  # whether to export CSV
    clf_global = clf  # whether to export the model
    tar_global = tar  # whether to pack into a tar archive
    new_dir_global = new  # whether to create a new directory
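# Caveat: the default argument values above are evaluated once, when the
# function is defined, so a call such as set_global(csv=False) resets the
# other switches to their definition-time values instead of leaving any
# later changes untouched.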