本章包括下面各节:
22.1 单词向量预训练模型
22.1.1 加载模型
22.1.2 查找相似的单词
22.1.3 单词向量
22.2 将单词映射为向量
详细内容请阅读纸质书《Alink权威指南:基于Flink的机器学习实例入门(Python)》,这里为本章对应的示例代码。
from pyalink.alink import *
useLocalEnv(1)

from utils import *
import os
import pandas as pd

# Let pandas show long cell contents without truncating them.
pd.set_option('display.max_colwidth', 1000)

# ROOT_DIR is not defined in this file; presumably it comes from `utils` -- verify.
DATA_DIR = ROOT_DIR + "wordvec" + os.sep

WIKI_DEPENDENCY = "deps.words"
GLOVE_6B_100D = "glove.6B.100d.txt"
ORIGIN_FILE = "三国演义.txt"
W2V_MODEL_FILE = "w2v_model.ak"

# SQL projection shared by both loaders: everything before the first space of
# a line becomes `word`, everything after it becomes `vec`.
# NOTE(review): assumes every line contains at least one space -- confirm for
# both data files.
_WORD_VEC_SELECT_CLAUSE = (
    "SUBSTRING(txt FROM 1 FOR POSITION(' ' IN txt)-1 ) AS word, "
    "SUBSTRING(txt FROM POSITION(' ' IN txt) + 1 ) AS vec"
)


def _loadWordVectors(file_name):
    """Read a text file under DATA_DIR and split each line into word / vec.

    The file is read into a single text column named ``txt``; the result
    exposes two string columns, ``word`` and ``vec``.

    :param file_name: file name relative to DATA_DIR.
    :return: the batch operator produced by the ``select`` projection.
    """
    text_source = TextSourceBatchOp()\
        .setFilePath(DATA_DIR + file_name)\
        .setTextCol("txt")
    return text_source.select(_WORD_VEC_SELECT_CLAUSE)


def getWikiDependency():
    """Return the (word, vec) table loaded from the WIKI_DEPENDENCY file."""
    return _loadWordVectors(WIKI_DEPENDENCY)


def getGlove6B100d():
    """Return the (word, vec) table loaded from the GLOVE_6B_100D file."""
    return _loadWordVectors(GLOVE_6B_100D)
#c_1_2
# For each word-vector table, find the 7 nearest neighbours of the word 'king'
# under each of the two metrics. Every result is registered with lazyPrint,
# titled by the metric name, and a single execute() call follows the loops.
for word2vec in [getWikiDependency(), getGlove6B100d()]:
    for metric in ["EUCLIDEAN", "COSINE"]:
        neighbor_finder = (
            VectorNearestNeighbor()
            .setIdCol("word")
            .setSelectedCol("vec")
            .setMetric(metric)
            .setOutputCol("similar_words")
            .setTopN(7)
        )
        neighbor_model = neighbor_finder.fit(word2vec)

        # Only the 'king' row is used as the query.
        king_query = word2vec.filter("word='king'")
        king_neighbors = neighbor_model.transform(king_query)
        king_neighbors.select("word, similar_words").lazyPrint(-1, metric)

BatchOperator.execute()
#c_1_3
# Show the rows for 'man', 'woman', 'king' and 'queen' from both word-vector
# tables (wiki-dependency table first, then GloVe), then run the job.
for load_word_vectors in (getWikiDependency, getGlove6B100d):
    selected_rows = load_word_vectors().filter(
        "word IN ('man', 'woman', 'king', 'queen')"
    )
    selected_rows.lazyPrint(-1)

BatchOperator.execute()
#c_1_3_1
# Vector arithmetic on hard-coded vectors for 'man', 'woman', 'king', 'queen',
# followed by a nearest-neighbour search against the wiki-dependency table.
# NOTE(review): the literals below are presumably the `vec` values of these
# words in the deps.words file -- verify against the data.
vec_man = DenseVector([
    -0.00220404170083,0.0678135463787,0.0415827872463,-0.0145794269917,-0.0180352093149,0.144706288126,
    -0.042149784084,-0.0216009491719,0.0638050780007,-0.0129117679223,0.151791574806,-0.0337671071952,
    -0.0565042238321,0.075027992403,0.0542110567498,-0.0134639105297,-0.0519148101319,-0.103488516415,
    -0.0298512060324,0.0140816291291,0.0432066082216,-0.0303133175126,-0.0883812208024,0.062041409882,
    -0.044512874738,0.0710490513363,-0.0567426161127,-0.0569431482211,0.0664918821847,-0.00548356590169,
    0.00485244226516,0.0314308266153,0.0779908678907,-0.0283226695098,0.0350599686745,0.0332841635629,
    -0.0607813363988,0.024102437776,0.0405754168265,-0.0247784864016,0.0431761751645,0.0862874989335,
    0.0128255409272,0.0690049643344,0.0415603247518,-0.0194483538295,-0.0272676568636,-0.0585464993425,
    -0.00153650708527,0.0179571340315,-0.0639164847275,0.00511039865392,-0.0106836333846,
    -0.0404940808584,-0.0407839194974,0.0181438988046,0.0341792215101,-0.0105126285875,0.105609229508,
    0.0762344457381,-0.0607018119222,-0.00303642204236,-0.0812955718233,-0.0164353000274,
    -0.047065085104,-0.0218728540451,0.11017690531,-0.0706059605168,0.0443634266897,0.00945761594138,
    -0.0247411696768,-0.147140570395,0.0117324869596,0.0322894735835,0.0834005249394,-0.0255103289652,
    -0.047516146486,-0.0727475057611,0.136698408827,-0.0337493545785,0.0436060420961,0.113064603901,
    -0.0783326963356,0.126881574122,-0.0566460636161,-0.00485932593284,-0.124211797958,0.00520622655395,
    -0.0841312806081,0.004456993672,0.0160320620208,0.0449710009368,-0.0171330865509,-0.0630871839216,
    0.0419059573289,-0.0361901582176,-0.0500458943575,0.0579496128952,-0.042589976517,0.049046313351,
    -0.0349088902839,-0.0228651890856,0.00574351493109,-0.148883225213,0.00681972202772,
    -0.0342051258384,0.104763806427,-0.0229291347353,-0.0633724938319,-0.0688037076294,0.0442623454644,
    -0.00582756813642,0.0883218038814,0.0774441597578,0.0557479261333,0.0337419274634,0.0232256758929,
    0.0116797725572,0.0445858778452,0.138361901466,0.0417927391105,0.0814153114111,0.0447528973609,
    0.00998729605424,0.0480103213656,0.0885399074573,0.0713434187041,0.00199264064347,0.0680106366533,
    -0.0147074994403,-0.0373772285937,-0.043165849663,0.00223882233752,0.0466046039662,0.0441302877344,
    -0.0443967581332,0.0776084620363,0.00467093081745,-0.0657792414309,-0.0361483127641,0.0349045427043,
    0.0420131976253,-0.00180841195853,-0.0023663513387,0.0141508281042,0.0455017679446,0.0515291435928,
    0.0187927750578,-0.0236693101598,-0.0991280752401,-0.026072615925,-0.0035715366297,
    -0.0356289581528,-0.00706137499303,-0.028393317678,-0.0226537880282,-0.0562448182502,
    -0.00793795572701,0.0294197087586,-0.0522501172074,-0.0132137435543,-0.024405500303,-0.106365889505,
    -0.0284715741106,-0.112262113169,-0.027080167494,0.0371308657505,-0.0164880144299,0.0120855466518,
    -0.0615039403557,-0.0636583471896,0.0413688501009,-0.0335770817374,0.0115250711836,0.00234189620353,
    0.00517325740876,-0.0318260940593,-0.0752989915305,0.0732786350672,0.0173156848935,0.0220239324357,
    0.0362946812768,-0.0878689310079,0.133256756137,-0.0165050424499,0.00116569477643,-0.0684093459311,
    -0.0816989909791,0.0488830979674,0.045251238671,-0.0607186587931,-0.0260135613023,0.0855552940716,
    -0.12988285323,-0.0763797273559,-0.0939136969744,-0.0742133646781,0.0111490055494,-0.00542288093664,
    -0.0472696024937,-0.00213194433937,-0.00409415192574,0.025322114999,-0.0282545574296,
    0.0245714329238,-0.00910890382877,0.0557037257409,0.0365459351471,0.0267329045746,0.0915252454395,
    0.120522333232,-0.0476376975653,0.0422655383904,0.0116381082528,-0.020368048061,0.0774028577517,
    0.172846359293,0.0562408329689,0.0931919987632,0.0471446095806,-0.0731395125204,0.0468344822368,
    0.0484217110839,-0.073229905946,0.050022344968,0.0151862766422,-0.0681736708878,0.0626352167938,
    0.0696656152823,-0.0243629302529,0.00182996870731,0.00116949890857,0.159671381657,0.0205299954006,
    0.0500475246998,0.0362838123279,-0.0326323889233,0.079122506627,-0.0585609912745,0.0344496771901,
    0.0229575751518,0.00719017203825,0.0602706769466,0.01418578989,-0.0499381106136,-0.123776315403,
    -0.0922469436507,0.0141167720641,-0.00585727659692,-0.0237772750528,0.0728150743939,
    -0.0151895373269,-0.00172906863109,0.0540235673801,0.0352646672133,-0.0869593811287,
    -0.0638471046033,-0.0194309635111,0.0705387541824,-0.0673945483962,0.0364571720639,-0.0613354716468,
    0.0492368822562,-0.0462057135392,-0.0107028351944,0.0119197951801,0.0393289295294,0.048293276337,
    -0.0136312923437,-0.179455586009,0.052450830465,-0.00182326618878,0.0396665915439,0.00621885029914,
    -0.0347118000093,0.0515389256469,0.0218607170521,-0.0478699307748,-0.0394441403884,-0.0236352541197,
    -0.00943768953494,-0.028646745338,-0.0799038029084,0.0192313371485,0.054759395225,0.0405284991968,
    0.0752306983012,0.0205116993365,0.0747990198782,-0.0646151769968,0.00833883879464,0.050741325942,
    0.0133485185215,-0.0694659889196,0.0447963731568,-0.000302881377808,-0.0743483207944,
    -0.0796594327059,-0.0164630158472,-0.0441532936764
])

vec_woman = DenseVector([
    -0.0923953735088,0.062354665309,-0.0682686456655,0.048195826923,-0.0095468221514,0.105997991417,
    -0.014845581153,0.0151306377161,0.072575366664,0.00120951798016,0.149595694754,-0.0258451283883,
    0.0146608603837,0.0479136861009,-0.00676829247491,-0.0733037873761,-0.113669992093,-0.0400013940617,
    0.0148815991303,0.0490117198657,0.0486700636239,0.0238102841857,-0.0545149237669,0.068213075072,
    -0.0436013051345,0.133555174326,-0.0357794010668,-0.0117240231214,0.0670014989217,-0.0148994366047,
    -0.010707458591,0.00960856725532,0.0514741489129,-0.0597749205643,0.0573469658668,0.00388170886656,
    -0.0786130087425,0.0356509369478,0.0715052897102,0.0058419444019,-0.0451022257023,0.0326794538216,
    0.0290673652421,0.0376176900219,-0.00313013374049,-0.0558396992744,-0.0120471558319,
    0.00773083003938,0.0160659045263,0.0284042914317,-0.0337920662914,-0.00866609684962,
    -0.00168701345049,-0.0280652079027,-0.0222076557106,0.0599160767324,-0.00770630351199,
    -0.0667687541827,0.144065220098,0.0177952819787,-0.00562394988222,-0.02769559485,-0.108722151099,
    -0.0929776641416,-0.109614367851,-0.0185979683296,0.0672939305944,-0.0590288338919,0.0495305502528,
    0.00501318789593,-0.0188578123087,-0.129812220429,-0.0227613034758,0.0181184146892,0.0540226780773,
    -0.0324755234645,-0.0495677688293,-0.1263689018,0.14630982614,-0.0534362711042,0.0445444615969,
    0.0700842947491,-0.180258827904,0.0878787191569,-0.0861760979163,-0.0857423385612,-0.102763748571,
    0.025485463158,-0.0720994148213,0.0178841263226,0.000425698188706,-0.0197947942607,0.024358271983,
    -0.109326738575,-0.039054121259,-0.021232426097,-0.0176808820222,0.0184684751256,-0.00301950709596,
    0.0458568880836,-0.0280156403053,-0.0652055739684,-0.0428989545774,-0.124810523983,
    -0.000185749854299,-0.0701319756905,0.0403401345623,-0.0182767222751,-0.0878262358186,
    -0.0598584479688,0.0647704425,-0.0495142564059,0.0499855773658,0.0380457894091,0.0419012566065,
    -0.014114416214,0.0751813530779,-0.0190830104238,-0.00171359814801,0.082342927562,0.0802993361363,
    0.134567107973,0.0650849995016,0.0141840509701,-0.00596440552468,0.0295458897975,0.0468542430261,
    0.0334896867964,0.0427727201427,0.00837469426195,-0.0867474116417,-0.0196050995803,0.0606614773481,
    0.0577649459175,-0.00796528992011,-0.00462316465616,0.133496859505,0.0233127215566,-0.0632115501401,
    -0.0167939822101,0.0507690541289,0.0441614704384,-0.0453769914148,0.0339716416353,-0.0274534168313,
    0.0892136140009,0.0720007941692,0.0383738960308,-0.065645679348,-0.0230317813338,0.0215128517773,
    -0.0497991414548,-0.0185619503524,-0.00282655364621,0.0283879975848,-0.0389481254973,
    -0.0275192782755,0.0505514026376,0.0402641537817,-0.0335474870742,-0.00767920427194,
    -0.0159605948213,-0.0863086783755,0.0770839598496,-0.109090906581,0.00196075007788,0.0114959092652,
    -0.0207082787704,0.00355531738666,-0.0648733510065,-0.0437623569472,0.0809910528144,
    -0.0348729486384,-0.042169333266,0.00687686094931,0.00771470770669,0.00937685160144,
    -0.0327782459879,0.112907268546,0.0707099784689,-0.0238847213388,0.0190627717508,-0.0365002751551,
    0.0650844849591,-0.0474641474415,-0.0371004032624,-0.0441911423911,-0.0650795110479,0.0596349649954,
    0.0679506583803,-0.104315780363,-0.0358552103333,0.0688571108087,-0.125921249797,-0.0793501766777,
    -0.0764694245514,-0.0625042256718,-0.0398351968236,-0.00139784054712,-0.0992506236718,
    -0.0349307489162,-0.0105265111336,0.0368045413061,-0.036131176645,0.00701733106073,0.065765396244,
    0.0260879924637,0.00788244857235,-0.00236672413616,0.0336365029324,0.0539302319356,-0.0723362759005,
    0.0242152291589,0.0638574725328,-0.0468681356745,0.0453092433146,0.0645030518971,0.0179481011109,
    0.0822961041915,0.101609458156,-0.0258916087304,0.0642037596573,0.0292023468999,-0.083472176907,
    0.0748539325129,0.0146876165954,-0.030909255995,0.065078653477,0.0733888584082,-0.0368355853722,
    0.049940297623,-0.0325051954172,0.104120597229,-0.00500769944225,-0.0448897196363,-0.036414175038,
    0.0349345222282,0.15395078275,-0.0840393742922,0.04540717791,-0.0183967821994,-0.0147565652948,
    0.0229129220087,-0.0174109187067,0.0180155061827,-0.113339312759,-0.11662055049,0.0344210087805,
    -0.0368690306369,-0.0361822878699,-0.0173210452777,-0.0278091372356,-0.0549642909121,
    0.0831401254593,0.0164374042349,-0.0477666984507,-0.0735370466576,-0.0335068382141,0.0567238548597,
    -0.0508843116563,-0.0149018378032,-0.0675674957076,0.0611645284309,0.0208751620651,
    -0.00623368278345,-0.0568559207764,0.0579577278531,0.0681614493045,-0.0232475461691,-0.139026819618,
    0.0638504404515,-0.0417685046331,-0.00878392708961,0.048339727318,-0.0840908285455,0.0444873473758,
    0.0425456353716,-0.0316160659207,-0.0688315551962,0.0310598454429,0.0188993187396,-0.0333886649458,
    -0.0515029632947,0.0329230039537,0.0189056647642,0.0479807481443,0.0869861593769,-0.0252765588897,
    0.0200299401981,-0.0247308007767,0.0251559844229,-0.0181552902374,0.0185816744828,-0.0372676295855,
    0.0386944560286,-0.0658888864518,-0.039044516465,-0.0329171724717,-0.0518072294457,
    0.0179050510523
])

vec_king = DenseVector([
    0.0330381329194,0.0665698704227,0.0262800220171,-0.0573233783314,0.0135350782339,0.0539792823665,
    -0.0321528819666,-0.0233145564785,0.00133663715962,0.02802799633,0.0941987111865,0.00248237383102,
    0.0470419519499,0.103658297617,0.0284782144395,0.0231670424649,-0.00882697825621,-0.0873673438906,
    -0.0683468802995,-0.0462785168681,-0.0534324459156,-0.00981769171647,0.0455728191718,
    -0.0544375102867,-0.0192650965594,0.142169300564,0.0191646401836,0.0453715726781,0.0132457238226,
    0.0238588898636,-0.0716704511245,-0.049105145688,0.119388898336,-0.0108634726252,-0.0840693043372,
    -0.0117917763105,0.00234220214387,0.0185465497917,0.0442281722014,0.0254097894593,0.0349079231033,
    0.0927918213123,0.0419685712304,0.00143726040646,-0.0181866089236,-0.0293169083676,0.00289120791854,
    0.0412899066451,-0.00506653900628,-0.0222759510252,-0.00194271283549,-0.0767471692652,
    -0.0921146585665,0.00433781310415,0.0158220460582,-0.0100643271374,0.0112636294179,
    -0.00254645231657,-0.115208110896,-0.0190087826172,0.0234410447391,0.0215248644015,
    0.000333575241208,0.0233684558297,-0.0776838165032,-0.0638970967167,-0.0249542314761,
    -0.0100251124392,-0.0396964549301,0.0194451504289,-0.0408672222599,-0.0939577493815,
    -0.0254124593962,-0.0222377375533,0.040674319319,0.00156474989335,-0.00562972882073,
    -0.0923975050066,-0.0037240613597,-0.0700389528087,0.0437459148492,-0.0383488042808,
    -0.0227029740578,0.0856896222922,-0.0528413886348,0.0869177932654,-0.108374240534,-0.0430757606878,
    0.0234325343152,-0.0852679391334,0.0398711689258,0.0553487931242,0.0518913917122,0.011713013172,
    -0.0292461550398,0.0413067606217,-0.0264916145162,-0.0532630717936,0.0333600271866,0.0153077494632,
    -0.00971740221173,0.0125703966584,-0.0324509136728,-0.0187801692703,-0.021092334624,
    -0.0305355676905,0.00352782099769,-0.0139532571006,-0.0183539805929,-0.0374970944103,
    0.0176983442134,-0.0147283731563,0.0271492533485,0.0611023402686,0.028643249914,-0.0351704112746,
    0.120878889996,0.0430932821487,-0.0071095413465,0.0341006009337,-0.0237223893397,0.0256866285415,
    -0.0343402277703,0.0487924293288,0.0210194119725,0.00124519182086,0.0485306086417,0.0750082059892,
    -0.0373182086381,0.0615213534905,0.00246602046752,0.0265133077535,-0.0168888527201,0.0442101501274,
    0.0203390786766,-0.0685734911937,0.0953204184258,0.0034452198249,-0.064757317011,0.0399227320822,
    0.133338818015,-0.0327686361637,0.0116489346865,-0.0183264468686,0.0438325209274,-0.023651802883,
    0.0995132205822,0.0206986858026,0.0342162425756,-0.0435264794104,-0.00740073133945,-0.0057937630689,
    -0.00267561051402,0.00654418220833,0.0277641731903,0.0659632941337,0.057346072795,-0.0114360072188,
    -0.0240195866907,-0.0124515844665,-0.0359702242466,-0.055722918032,-0.0463884848941,
    0.0683348655834,-0.155736918654,-0.0843628305249,0.0482928173867,-0.00038997765818,-0.0406199193547,
    -0.0918284747052,-0.0667976494144,0.0372963485297,0.0383634889337,-0.0168709975171,
    -0.0798070838215,0.0714233150903,0.015313089337,0.030887665619,0.0320717826333,0.0237357390242,
    -0.00285900180471,-0.0253363661946,-0.0554786188059,0.0294742677735,-0.046435542532,
    -0.00314868995815,-0.186628756049,-0.0952353141872,0.182545421306,0.0334120909561,0.014159509726,
    -0.0694415544277,0.00423101562823,0.0165244063335,-0.0298050062068,0.0334703289547,0.0128066860739,
    -0.0135522659527,-0.0458431502827,-0.139522391871,-0.0334446308121,-0.0507084427953,0.0428137731297,
    -0.0329196544695,-0.0102113405379,0.00478185698452,0.0596383804931,-0.0960895271235,0.125720319948,
    0.169201911044,-0.0187761643649,0.042368894894,0.0135659493793,0.0458503257381,-0.0126451548916,
    -0.0886390682094,0.00128323842166,0.0174855836168,0.0806846587057,-0.090234355506,0.0810689627479,
    0.0667399120289,0.0380170646212,0.00552376570008,-0.016451650553,0.0818218849532,0.0307398178632,
    -0.0350374150429,0.000343253762464,0.0473021039265,-0.128344033563,0.0345044288896,0.00158277196741,
    0.0237108752369,0.0361431026608,0.00922279640136,0.0827633714519,0.0621466193378,-0.0156743651735,
    -0.00911149340692,-0.0758977955895,-0.00623830756244,-0.0275959671657,0.00318173042727,
    0.0828069247976,-0.103742066887,-0.0408753989416,0.0521663952127,-0.00092496626414,-0.0101921503664,
    -0.0297047167021,-0.0108247585402,0.00731662832716,0.0719734889624,0.0539732750085,-0.107010570263,
    -0.0747198528042,0.00864725812876,-0.0168704969039,-0.0366178508155,-0.0121999429138,
    0.0788080268084,0.0223829153722,0.0517285255614,0.0268236879179,0.130153750167,-0.0144270040289,
    0.0194927086799,-0.119658561963,-0.122853475203,0.0846531861626,-0.155554862331,-0.0860997913482,
    -0.0621898389414,-0.17728314255,0.0893194015052,0.0185023289618,-0.135035896656,0.00151218551067,
    0.0556621769676,0.133543235059,-0.0642887430854,0.0156474989335,-0.00345423086193,0.0526394746568,
    -0.0207610955776,0.0352259793363,0.0648282372099,0.0572200851477,-0.0698799246922,-0.00263205716837,
    0.13508078497,-0.04197641417,-0.00110151584153,-0.0527783113755,0.0265091359771,-0.0877034221976,
    0.00353282712937,-0.013237880883,-0.0142381059934
])

vec_queen = DenseVector([
    0.0199859507483,0.1526205478,0.000618664683489,0.0165901690411,0.074417065154,0.0655692910471,
    -0.053041765773,-0.0687128661553,0.0255264366915,-0.0386942759625,0.0730841311552,-0.0922892844064,
    -0.0229324696981,0.0638844371888,-0.0150418480785,0.0189000085504,-0.0429470228238,-0.0769546175135,
    -0.0130125014973,0.00219606209615,-0.0236931981237,0.0251165219562,0.0139353627365,
    -0.0302018982472,-0.00408697949146,0.135300465049,0.0668865806516,-0.00476063659126,
    -0.0252876621483,-0.0492251656617,-0.0638152225958,-0.0311961864179,0.0566992355301,0.021559081508,
    -0.0473516309267,-0.0499675632819,0.11703903223,0.0025697892932,0.00441677596514,0.0769546175135,
    -0.00425179871621,0.0394772857982,0.0202733967404,-0.00569297928158,0.0428439910553,
    0.00924504690749,-0.0506530721955,0.067472692353,-0.0232486760918,0.00612564949956,0.0445348498327,
    -0.10462986707,-0.0260359066633,-0.0325488734398,0.00375765709034,0.069927756088,0.00613465687809,
    -0.016161607452,-0.105185638128,0.0101784957671,0.0802712291041,-0.0216829724689,-0.0205973463193,
    0.00489606331763,-0.0986225776848,-0.0480635298792,-0.0312830997196,-0.0945687832722,
    0.0230062669924,0.0383120154595,-0.0546972271329,-0.0758915888224,-0.00866667839113,
    -0.0110841323702,0.00186942610617,0.0170563403863,0.0174003590366,-0.0958972925938,0.0187356633982,
    -0.0299860372108,-0.0212983416031,0.00892915656208,0.00700900469258,0.0946809804434,
    -0.0435268135578,0.00714854004776,-0.0369166619077,0.0342996233832,0.0416847256356,0.0207027484505,
    0.0526122560387,0.0445416448726,0.0493632787992,-0.0214533633283,0.0302292364312,-0.0469761654636,
    0.0588218163804,-0.0169416148281,0.0379850634211,0.0411926383242,-0.0581699666181,-0.0157731840057,
    -0.0222235732051,0.00352472944195,-0.0241168609631,0.0057759419786,0.0748854488378,0.043092563098,
    -0.0111934851061,-0.0778250147234,-0.00970789974477,-0.009734605832,-0.000632570811751,
    0.0428188652099,0.0544333267443,-0.0271946980105,0.0852948179388,0.0416643405158,-0.0533714042225,
    0.128496575749,0.0231078765432,0.0491968793326,-0.0049483693228,0.0792908470616,-0.0303152015877,
    0.0316332813131,-0.0184588050264,0.101064683437,0.0119540555075,0.0208304319919,-0.0388508779296,
    0.0393660367721,-0.0616641973923,-0.00849601227154,-0.0467373909204,-0.00932785158033,
    0.0759282504332,0.0299090794328,0.00196234432684,0.0687089155507,0.0694321922445,0.0461394274051,
    -0.0353237781249,-0.0585525431695,-0.0481272136257,0.0317462686052,0.165568101357,0.0869850446195,
    0.0442423470666,0.0781791469215,0.00161247878169,-0.0178734834459,0.0611762187097,0.0285407480166,
    -0.0262222171772,0.0767811069586,-0.00349628508868,-0.00613118034602,0.0282738451685,
    -0.0766406234583,0.0388758457508,0.0050253271008,0.00336654723296,0.0447516590142,-0.118273201113,
    -0.0591146351948,0.0141709767961,-0.0156278017557,-0.0962939332976,-0.0576453263245,
    0.00461794075239,0.0248338166896,0.042656416348,0.00735823814099,-0.116470777261,0.126713114799,
    0.00540853574894,-0.00261466816169,0.0717761649776,0.0596139916188,-0.054438225494,-0.0806844623473,
    0.072645297994,-0.0136325883984,-0.0536226626763,0.0188807295998,-0.0959965317818,-0.04431535424,
    0.149231245084,0.0243888205852,-0.0337752991381,-0.00974803788771,0.0531040273018,-0.0498515735302,
    -0.0612596554793,0.0206042993834,-0.0412204505807,0.0134407470381,0.096124057299,-0.14383155869,
    -0.00114614941235,-0.0758740481379,0.0221914942955,-0.0283618646394,0.00667573168684,
    -0.00922971856157,-0.011458491664,-0.015071398601,0.0886578886397,0.116466036536,0.0829227169008,
    0.0263431056786,0.0541117475282,0.0670427085462,0.0810667228504,-0.0576652373718,-0.0213899956302,
    -0.0312309517386,0.0512192728497,-0.0627393939457,0.0873235324233,0.106129990656,0.0314580324922,
    0.00744894402306,-0.0164963026754,0.0926408882176,0.0149500360271,0.0327484579852,0.0299005461268,
    -0.0543593714258,-0.0530444521841,0.057176310544,-0.0461425878888,-0.0161775678946,-0.0101426242772,
    -0.0337007117228,0.0797318925614,0.115810078144,-0.0557607298966,-0.0341913768166,
    -0.00794830044703,-0.0359602995413,-0.0353556990102,0.00737388253529,0.0849293079994,
    -0.109218099276,-0.066264439436,0.0851826207676,0.0488566532627,-0.010613694372,-0.0493604343639,
    0.0270755477751,0.0503324411246,0.0729935832973,0.0779000762112,-0.0777292520674,-0.0369526914218,
    0.0396248803868,0.0754470667905,-0.0569478075728,-0.000792965359322,0.0110006956006,
    -0.0244976992485,-0.0154890565215,-0.0568800151975,0.0403171843404,0.0681994455789,0.0543968231576,
    -0.072004825973,-0.14427829306,0.0705822922615,-0.12334672559,-0.117944668833,0.00840135578485,
    -0.165265801092,0.107454075301,0.0254774491943,-0.116934104171,-0.0859812749976,0.107266184545,
    -0.00975767736298,-0.11928281763,-0.00672203277299,0.0101364613339,0.0463544983206,-0.086025205721,
    0.0207670642938,0.0134198878457,0.136448668776,-0.0568893386245,-0.0168189880607,0.0293896539373,
    -0.0191718101482,0.0392143335547,0.00512172185353,0.0049556384353,-0.0369394173903,
    -0.00211562778609,0.0267236279156,0.0352932794572
])

# The two "gender offset" vectors and the gap between them.
man_minus_woman = vec_man.minus(vec_woman)
king_minus_queen = vec_king.minus(vec_queen)

# (caption, vector) pairs; the L2 norm of each vector is printed after its caption.
norm_report = [
    ("'man' vector normL2 : ", vec_man),
    ("'woman' vector normL2 : ", vec_woman),
    ("'king' vector normL2 : ", vec_king),
    ("'queen' vector normL2 : ", vec_queen),
    ("'man - woman' normL2 : ", man_minus_woman),
    ("'king - queen' normL2 : ", king_minus_queen),
    ("(man - woman) - (king - queen) normL2 : ",
     man_minus_woman.minus(king_minus_queen)),
]
for caption, vector in norm_report:
    print(caption + str(vector.normL2()))

# Query vectors: the two originals plus the two analogy results.
analogy_queries = [
    ("king", vec_king),
    ("king-man+woman", vec_king.minus(vec_man).plus(vec_woman)),
    ("queen", vec_queen),
    ("queen-woman+man", vec_queen.minus(vec_woman).plus(vec_man)),
]
df = pd.DataFrame([[label, str(vector)] for label, vector in analogy_queries])
target_set = BatchOperator.fromDataframe(df, schemaStr='word string, vec string')

# Top-5 Euclidean neighbours of each query within the wiki-dependency table.
neighbor_finder = (
    VectorNearestNeighbor()
    .setIdCol("word")
    .setSelectedCol("vec")
    .setMetric('EUCLIDEAN')
    .setOutputCol("similar_words")
    .setTopN(5)
)
neighbor_model = neighbor_finder.fit(getWikiDependency())
neighbor_model.transform(target_set).select("word, similar_words").print()
#c_1_3_2
# Same vector-arithmetic experiment, this time with hard-coded vectors that
# are searched against the GloVe table.
# NOTE(review): the literals below are presumably the `vec` values of these
# words in the glove.6B.100d.txt file -- verify against the data.
vec_man = DenseVector([
    0.37293,0.38503,0.71086,-0.65911,-0.0010128,0.92715,0.27615,-0.056203,-0.24294,0.24632,-0.18449,0.31398,
    0.48983,0.09256,0.32958,0.15056,0.57317,-0.18529,-0.52277,0.46191,0.92038,0.031001,-0.16246,
    -0.40567,0.78621,0.57722,-0.53501,-0.68228,0.16987,0.3631,-0.071773,0.47233,0.027806,-0.14951,
    0.17543,-0.37573,-0.78517,0.58171,0.86859,0.031445,-0.45897,-0.040917,0.95897,-0.16975,0.13045,
    0.27434,-0.069485,0.022402,0.24977,-0.21536,-0.32406,-0.39867,0.68613,1.7923,-0.37848,-2.2477,
    -0.77025,0.46582,1.2411,0.57756,0.41151,0.84328,-0.54259,-0.16715,0.73927,-0.093477,0.90278,0.50889,
    -0.50031,0.26451,0.15443,-0.29432,0.10906,-0.26667,0.35438,0.049079,0.18018,-0.5859,-0.55542,
    -0.28987,0.74278,0.3453,-0.028757,-0.22646,-1.3113,-0.5719,-0.52306,-0.1267,-0.098678,-0.53463,
    0.28607,-0.37501,0.45742,0.045975,-0.24675,0.045656,-0.38302,-0.93711,0.039138,-0.53911
])

vec_woman = DenseVector([
    0.59368,0.44825,0.5932,0.074134,0.11141,1.2793,0.16656,0.2407,0.39045,0.32766,-0.75034,0.35007,0.76057,
    0.38067,0.17517,0.031791,0.46849,-0.21653,-0.46282,0.39967,0.16623,-0.011477,0.044059,0.30325,
    0.6153,0.47047,-0.44036,-1.5963,0.18433,0.23193,0.20452,0.51617,0.65734,-0.3452,0.23446,-0.62004,
    -0.68741,0.28575,1.0605,0.46916,-0.85149,0.10154,0.21426,-0.20587,0.23636,0.21321,-0.21287,0.12107,
    0.18766,-0.23282,-0.25499,-0.39631,0.84379,1.6801,-0.40941,-1.9976,-0.69868,0.21732,1.2197,0.55126,
    0.44095,0.72588,-0.092053,-0.022406,0.72039,0.1076,0.84116,0.30312,-0.42544,0.056362,0.13109,
    -0.071181,-0.10579,0.56677,0.54547,0.84113,0.14861,-0.62628,-0.68391,-1.0831,-0.088385,0.32167,
    0.47794,0.091868,-1.2559,-1.2268,0.085401,0.36833,0.081566,-0.76611,0.87751,-0.22008,0.82401,
    -0.092207,-0.45941,0.46571,-0.56018,-0.54648,0.15162,-0.30754
])

vec_king = DenseVector([
    -0.32307,-0.87616,0.21977,0.25268,0.22976,0.7388,-0.37954,-0.35307,-0.84369,-1.1113,-0.30266,0.33178,
    -0.25113,0.30448,-0.077491,-0.89815,0.092496,-1.1407,-0.58324,0.66869,-0.23122,-0.95855,0.28262,
    -0.078848,0.75315,0.26584,0.3422,-0.33949,0.95608,0.065641,0.45747,0.39835,0.57965,0.39267,-0.21851,
    0.58795,-0.55999,0.63368,-0.043983,-0.68731,-0.37841,0.38026,0.61641,-0.88269,-0.12346,-0.37928,
    -0.38318,0.23868,0.6685,-0.43321,-0.11065,0.081723,1.1569,0.78958,-0.21223,-2.3211,-0.67806,0.44561,
    0.65707,0.1045,0.46217,0.19912,0.25802,0.057194,0.53443,-0.43133,-0.34311,0.59789,-0.58417,
    0.068995,0.23944,-0.85181,0.30379,-0.34177,-0.25746,-0.031101,-0.16285,0.45169,-0.91627,0.64521,
    0.73281,-0.22752,0.30226,0.044801,-0.83741,0.55006,-0.52506,-1.7357,0.4751,-0.70487,0.056939,
    -0.7132,0.089623,0.41394,-1.3363,-0.61915,-0.33089,-0.52881,0.16483,-0.98878
])

vec_queen = DenseVector([
    -0.50045,-0.70826,0.55388,0.673,0.22486,0.60281,-0.26194,0.73872,-0.65383,-0.21606,-0.33806,0.24498,
    -0.51497,0.8568,-0.37199,-0.58824,0.30637,-0.30668,-0.2187,0.78369,-0.61944,-0.54925,0.43067,
    -0.027348,0.97574,0.46169,0.11486,-0.99842,1.0661,-0.20819,0.53158,0.40922,1.0406,0.24943,0.18709,
    0.41528,-0.95408,0.36822,-0.37948,-0.6802,-0.14578,-0.20113,0.17113,-0.55705,0.7191,0.070014,
    -0.23637,0.49534,1.1576,-0.05078,0.25731,-0.091052,1.2663,1.1047,-0.51584,-2.0033,-0.64821,0.16417,
    0.32935,0.048484,0.18997,0.66116,0.080882,0.3364,0.22758,0.1462,-0.51005,0.63777,0.47299,-0.3282,
    0.083899,-0.78547,0.099148,0.039176,0.27893,0.11747,0.57862,0.043639,-0.15965,-0.35304,-0.048965,
    -0.32461,1.4981,0.58138,-1.132,-0.60673,-0.37505,-1.1813,0.80117,-0.50014,-0.16574,-0.70584,0.43012,
    0.51051,-0.8033,-0.66572,-0.63717,-0.36032,0.13347,-0.56075
])

# The two "gender offset" vectors and the gap between them.
man_minus_woman = vec_man.minus(vec_woman)
king_minus_queen = vec_king.minus(vec_queen)

# (caption, vector) pairs; the L2 norm of each vector is printed after its caption.
norm_report = [
    ("'man' vector normL2 : ", vec_man),
    ("'woman' vector normL2 : ", vec_woman),
    ("'king' vector normL2 : ", vec_king),
    ("'queen' vector normL2 : ", vec_queen),
    ("'man - woman' normL2 : ", man_minus_woman),
    ("'king - queen' normL2 : ", king_minus_queen),
    ("(man - woman) - (king - queen) normL2 : ",
     man_minus_woman.minus(king_minus_queen)),
]
for caption, vector in norm_report:
    print(caption + str(vector.normL2()))

# Query vectors: the two originals plus the two analogy results.
analogy_queries = [
    ("king", vec_king),
    ("king-man+woman", vec_king.minus(vec_man).plus(vec_woman)),
    ("queen", vec_queen),
    ("queen-woman+man", vec_queen.minus(vec_woman).plus(vec_man)),
]
df = pd.DataFrame([[label, str(vector)] for label, vector in analogy_queries])
target_set = BatchOperator.fromDataframe(df, schemaStr='word string, vec string')

# Top-5 Euclidean neighbours of each query within the GloVe table.
neighbor_finder = (
    VectorNearestNeighbor()
    .setIdCol("word")
    .setSelectedCol("vec")
    .setMetric('EUCLIDEAN')
    .setOutputCol("similar_words")
    .setTopN(5)
)
neighbor_model = neighbor_finder.fit(getGlove6B100d())
neighbor_model.transform(target_set).select("word, similar_words").print()
#c_2
# Segment the text in ORIGIN_FILE, print its 20 most frequent words, train a
# Word2Vec model on it (only when no saved model file exists yet), and print
# the 20 nearest neighbours of a handful of words.
source = TextSourceBatchOp().setFilePath(DATA_DIR + ORIGIN_FILE)
source.lazyPrint(8)

# Names passed to the segmenter as a user-defined dictionary.
CHARACTER_DICT = [
    "曹操", "孔明", "玄德", "刘玄德", "刘备", "关羽", "张飞", "赵云", "曹孟德",
    "诸葛亮", "张郃", "孙权", "张辽", "鲁肃",
]

# --- Word frequencies, using the stop-word remover without extra words ---
segmented_text = source.link(
    SegmentBatchOp()
    .setSelectedCol("text")
    .setUserDefinedDict(CHARACTER_DICT)
)
text_without_stop_words = segmented_text.link(
    StopWordsRemoverBatchOp().setSelectedCol("text")
)
word_counts = text_without_stop_words.link(
    WordCountBatchOp().setSelectedCol("text")
)
word_counts.orderBy("cnt", 20, order='desc').print()

# --- Train and save the Word2Vec model only if the model file is missing ---
model_path = DATA_DIR + W2V_MODEL_FILE
if not os.path.exists(model_path):
    # Extra stop words for training.
    # NOTE: "问" appears twice in the original list; kept as-is.
    extra_stop_words = [
        "亦", "曰", "遂", "吾", "已", "去", "二人", "今", "使", "中", "知", "不", "见", "都", "令",
        "却", "欲", "请", "人", "谓", "不可", "闻", "前", "后", "皆", "便", "问", "日", "时", "耳",
        "不敢", "问", "回", "才", "之事", "之人", "之时", "料", "今日", "令人", "受", "说", "出",
        "已毕", "不得", "使人", "众", "何不", "不知", "再", "处", "无", "即日", "诸", "此时", "只",
        "下", "还", "上", "杀", "将军", "却说", "兵", "汝", "走", "言", "寨", "不能", "斩", "死",
        "商议", "听", "军士", "军", "左右", "军马", "引兵", "次日", "二", "看", "耶", "退", "更",
        "毕", "正", "一人", "原来", "大笑", "车胄", "口", "引", "大喜", "其事", "助", "事", "未",
        "大", "至此", "讫", "心中", "敢",
    ]

    training_tokens = source.link(
        SegmentBatchOp()
        .setSelectedCol("text")
        .setUserDefinedDict(CHARACTER_DICT)
    )
    training_text = training_tokens.link(
        StopWordsRemoverBatchOp()
        .setSelectedCol("text")
        .setStopWords(extra_stop_words)
    )
    trained_model = training_text.link(
        Word2VecTrainBatchOp()
        .setSelectedCol("text")
        .setMinCount(10)
        .setNumIter(50)
    )
    trained_model.link(AkSinkBatchOp().setFilePath(model_path))
    BatchOperator.execute()

# --- Nearest neighbours of selected words in the saved model ---
word2vec = AkSourceBatchOp().setFilePath(model_path)

query_words = word2vec.filter(
    "word IN ('曹操', '操', '玄德', '刘备', '孔明', '亮', "
    "'卧龙', '周瑜', '吕布', '貂蝉', '云长', '孙权')"
)

neighbor_finder = (
    VectorNearestNeighbor()
    .setIdCol("word")
    .setSelectedCol("vec")
    .setTopN(20)
    .setOutputCol("similar_words")
)
neighbor_model = neighbor_finder.fit(word2vec)
neighbor_model.transform(query_words).select("word, similar_words").print()