本章包括下面各节:
22.1 单词向量预训练模型
22.1.1 加载模型
22.1.2 查找相似的单词
22.1.3 单词向量
22.2 单词映射为向量
详细内容请阅读纸质书《Alink权威指南:基于Flink的机器学习实例入门(Java)》,这里为本章对应的示例代码。
package com.alibaba.alink; import org.apache.flink.types.Row; import com.alibaba.alink.common.linalg.DenseVector; import com.alibaba.alink.common.linalg.VectorUtil; import com.alibaba.alink.operator.batch.BatchOperator; import com.alibaba.alink.operator.batch.nlp.SegmentBatchOp; import com.alibaba.alink.operator.batch.nlp.StopWordsRemoverBatchOp; import com.alibaba.alink.operator.batch.nlp.Word2VecTrainBatchOp; import com.alibaba.alink.operator.batch.nlp.WordCountBatchOp; import com.alibaba.alink.operator.batch.sink.AkSinkBatchOp; import com.alibaba.alink.operator.batch.source.AkSourceBatchOp; import com.alibaba.alink.operator.batch.source.MemSourceBatchOp; import com.alibaba.alink.operator.batch.source.TextSourceBatchOp; import com.alibaba.alink.params.shared.clustering.HasFastMetric.Metric; import com.alibaba.alink.pipeline.similarity.VectorNearestNeighbor; import java.io.File; public class Chap22 { static final String DATA_DIR = Utils.ROOT_DIR + "wordvec" + File.separator; static final String WIKI_DEPENDENCY = "deps.words"; static final String GLOVE_6B_100D = "glove.6B.100d.txt"; static final String ORIGIN_FILE = "三国演义.txt"; static final String W2V_MODEL_FILE = "w2v_model.ak"; public static void main(String[] args) throws Exception { c_1_2(); c_1_3_a(); c_1_3_b(); c_1_3_c(); c_2(); } static BatchOperator <?> getWikiDependency() { return new TextSourceBatchOp().setFilePath(DATA_DIR + WIKI_DEPENDENCY) .setTextCol("txt") .select("SUBSTRING(txt FROM 1 FOR POSITION(' ' IN txt)-1 ) AS word, " + "SUBSTRING(txt FROM POSITION(' ' IN txt) + 1 ) AS vec"); } static BatchOperator <?> getGlove6B100d() { return new TextSourceBatchOp() .setFilePath(DATA_DIR + GLOVE_6B_100D) .setTextCol("txt") .select("SUBSTRING(txt FROM 1 FOR POSITION(' ' IN txt)-1 ) AS word, " + "SUBSTRING(txt FROM POSITION(' ' IN txt) + 1 ) AS vec"); } static void c_1_2() throws Exception { BatchOperator.setParallelism(1); for (BatchOperator <?> word2vec : new BatchOperator <?>[] {getWikiDependency(), getGlove6B100d()}) { for (String metric : new String[] {"EUCLIDEAN", "COSINE"}) { new VectorNearestNeighbor() .setIdCol("word") .setSelectedCol("vec") .setMetric(metric) .setOutputCol("similar_words") .setTopN(7) .fit(word2vec) .transform(word2vec.filter("word='king'")) .select("word, similar_words") .lazyPrint(-1, metric); BatchOperator.execute(); } } } static void c_1_3_a() throws Exception { getWikiDependency() .filter("word IN ('man', 'woman', 'king', 'queen')") .lazyPrint(-1); getGlove6B100d() .filter("word IN ('man', 'woman', 'king', 'queen')") .lazyPrint(-1); BatchOperator.execute(); } static void c_1_3_b() throws Exception { DenseVector vec_man = VectorUtil.parseDense( "-0.00220404170083 0.0678135463787 0.0415827872463 -0.0145794269917 -0.0180352093149 0.144706288126 " + "-0.042149784084 -0.0216009491719 0.0638050780007 -0.0129117679223 0.151791574806 -0.0337671071952 " + "-0.0565042238321 0.075027992403 0.0542110567498 -0.0134639105297 -0.0519148101319 -0.103488516415 " + "-0.0298512060324 0.0140816291291 0.0432066082216 -0.0303133175126 -0.0883812208024 0.062041409882 " + "-0.044512874738 0.0710490513363 -0.0567426161127 -0.0569431482211 0.0664918821847 -0.00548356590169" + " 0.00485244226516 0.0314308266153 0.0779908678907 -0.0283226695098 0.0350599686745 0.0332841635629 " + "-0.0607813363988 0.024102437776 0.0405754168265 -0.0247784864016 0.0431761751645 0.0862874989335 " + "0.0128255409272 0.0690049643344 0.0415603247518 -0.0194483538295 -0.0272676568636 -0.0585464993425 " + "-0.00153650708527 0.0179571340315 -0.0639164847275 0.00511039865392 -0.0106836333846 " + "-0.0404940808584 -0.0407839194974 0.0181438988046 0.0341792215101 -0.0105126285875 0.105609229508 " + "0.0762344457381 -0.0607018119222 -0.00303642204236 -0.0812955718233 -0.0164353000274 " + "-0.047065085104 -0.0218728540451 0.11017690531 -0.0706059605168 0.0443634266897 0.00945761594138 " + "-0.0247411696768 -0.147140570395 0.0117324869596 0.0322894735835 0.0834005249394 -0.0255103289652 " + "-0.047516146486 -0.0727475057611 0.136698408827 -0.0337493545785 0.0436060420961 0.113064603901 " + "-0.0783326963356 0.126881574122 -0.0566460636161 -0.00485932593284 -0.124211797958 0.00520622655395" + " -0.0841312806081 0.004456993672 0.0160320620208 0.0449710009368 -0.0171330865509 -0.0630871839216 " + "0.0419059573289 -0.0361901582176 -0.0500458943575 0.0579496128952 -0.042589976517 0.049046313351 " + "-0.0349088902839 -0.0228651890856 0.00574351493109 -0.148883225213 0.00681972202772 " + "-0.0342051258384 0.104763806427 -0.0229291347353 -0.0633724938319 -0.0688037076294 0.0442623454644 " + "-0.00582756813642 0.0883218038814 0.0774441597578 0.0557479261333 0.0337419274634 0.0232256758929 " + "0.0116797725572 0.0445858778452 0.138361901466 0.0417927391105 0.0814153114111 0.0447528973609 " + "0.00998729605424 0.0480103213656 0.0885399074573 0.0713434187041 0.00199264064347 0.0680106366533 " + "-0.0147074994403 -0.0373772285937 -0.043165849663 0.00223882233752 0.0466046039662 0.0441302877344 " + "-0.0443967581332 0.0776084620363 0.00467093081745 -0.0657792414309 -0.0361483127641 0.0349045427043" + " 0.0420131976253 -0.00180841195853 -0.0023663513387 0.0141508281042 0.0455017679446 0.0515291435928" + " 0.0187927750578 -0.0236693101598 -0.0991280752401 -0.026072615925 -0.0035715366297 " + "-0.0356289581528 -0.00706137499303 -0.028393317678 -0.0226537880282 -0.0562448182502 " + "-0.00793795572701 0.0294197087586 -0.0522501172074 -0.0132137435543 -0.024405500303 -0.106365889505" + " -0.0284715741106 -0.112262113169 -0.027080167494 0.0371308657505 -0.0164880144299 0.0120855466518 " + "-0.0615039403557 -0.0636583471896 0.0413688501009 -0.0335770817374 0.0115250711836 0.00234189620353" + " 0.00517325740876 -0.0318260940593 -0.0752989915305 0.0732786350672 0.0173156848935 0.0220239324357" + " 0.0362946812768 -0.0878689310079 0.133256756137 -0.0165050424499 0.00116569477643 -0.0684093459311" + " -0.0816989909791 0.0488830979674 0.045251238671 -0.0607186587931 -0.0260135613023 0.0855552940716 " + "-0.12988285323 -0.0763797273559 -0.0939136969744 -0.0742133646781 0.0111490055494 -0.00542288093664" + " -0.0472696024937 -0.00213194433937 -0.00409415192574 0.025322114999 -0.0282545574296 " + "0.0245714329238 -0.00910890382877 0.0557037257409 0.0365459351471 0.0267329045746 0.0915252454395 " + "0.120522333232 -0.0476376975653 0.0422655383904 0.0116381082528 -0.020368048061 0.0774028577517 " + "0.172846359293 0.0562408329689 0.0931919987632 0.0471446095806 -0.0731395125204 0.0468344822368 " + "0.0484217110839 -0.073229905946 0.050022344968 0.0151862766422 -0.0681736708878 0.0626352167938 " + "0.0696656152823 -0.0243629302529 0.00182996870731 0.00116949890857 0.159671381657 0.0205299954006 " + "0.0500475246998 0.0362838123279 -0.0326323889233 0.079122506627 -0.0585609912745 0.0344496771901 " + "0.0229575751518 0.00719017203825 0.0602706769466 0.01418578989 -0.0499381106136 -0.123776315403 " + "-0.0922469436507 0.0141167720641 -0.00585727659692 -0.0237772750528 0.0728150743939 " + "-0.0151895373269 -0.00172906863109 0.0540235673801 0.0352646672133 -0.0869593811287 " + "-0.0638471046033 -0.0194309635111 0.0705387541824 -0.0673945483962 0.0364571720639 -0.0613354716468" + " 0.0492368822562 -0.0462057135392 -0.0107028351944 0.0119197951801 0.0393289295294 0.048293276337 " + "-0.0136312923437 -0.179455586009 0.052450830465 -0.00182326618878 0.0396665915439 0.00621885029914 " + "-0.0347118000093 0.0515389256469 0.0218607170521 -0.0478699307748 -0.0394441403884 -0.0236352541197" + " -0.00943768953494 -0.028646745338 -0.0799038029084 0.0192313371485 0.054759395225 0.0405284991968 " + "0.0752306983012 0.0205116993365 0.0747990198782 -0.0646151769968 0.00833883879464 0.050741325942 " + "0.0133485185215 -0.0694659889196 0.0447963731568 -0.000302881377808 -0.0743483207944 " + "-0.0796594327059 -0.0164630158472 -0.0441532936764"); DenseVector vec_woman = VectorUtil.parseDense( "-0.0923953735088 0.062354665309 -0.0682686456655 0.048195826923 -0.0095468221514 0.105997991417 " + "-0.014845581153 0.0151306377161 0.072575366664 0.00120951798016 0.149595694754 -0.0258451283883 " + "0.0146608603837 0.0479136861009 -0.00676829247491 -0.0733037873761 -0.113669992093 -0.0400013940617" + " 0.0148815991303 0.0490117198657 0.0486700636239 0.0238102841857 -0.0545149237669 0.068213075072 " + "-0.0436013051345 0.133555174326 -0.0357794010668 -0.0117240231214 0.0670014989217 -0.0148994366047 " + "-0.010707458591 0.00960856725532 0.0514741489129 -0.0597749205643 0.0573469658668 0.00388170886656 " + "-0.0786130087425 0.0356509369478 0.0715052897102 0.0058419444019 -0.0451022257023 0.0326794538216 " + "0.0290673652421 0.0376176900219 -0.00313013374049 -0.0558396992744 -0.0120471558319 " + "0.00773083003938 0.0160659045263 0.0284042914317 -0.0337920662914 -0.00866609684962 " + "-0.00168701345049 -0.0280652079027 -0.0222076557106 0.0599160767324 -0.00770630351199 " + "-0.0667687541827 0.144065220098 0.0177952819787 -0.00562394988222 -0.02769559485 -0.108722151099 " + "-0.0929776641416 -0.109614367851 -0.0185979683296 0.0672939305944 -0.0590288338919 0.0495305502528 " + "0.00501318789593 -0.0188578123087 -0.129812220429 -0.0227613034758 0.0181184146892 0.0540226780773 " + "-0.0324755234645 -0.0495677688293 -0.1263689018 0.14630982614 -0.0534362711042 0.0445444615969 " + "0.0700842947491 -0.180258827904 0.0878787191569 -0.0861760979163 -0.0857423385612 -0.102763748571 " + "0.025485463158 -0.0720994148213 0.0178841263226 0.000425698188706 -0.0197947942607 0.024358271983 " + "-0.109326738575 -0.039054121259 -0.021232426097 -0.0176808820222 0.0184684751256 -0.00301950709596 " + "0.0458568880836 -0.0280156403053 -0.0652055739684 -0.0428989545774 -0.124810523983 " + "-0.000185749854299 -0.0701319756905 0.0403401345623 -0.0182767222751 -0.0878262358186 " + "-0.0598584479688 0.0647704425 -0.0495142564059 0.0499855773658 0.0380457894091 0.0419012566065 " + "-0.014114416214 0.0751813530779 -0.0190830104238 -0.00171359814801 0.082342927562 0.0802993361363 " + "0.134567107973 0.0650849995016 0.0141840509701 -0.00596440552468 0.0295458897975 0.0468542430261 " + "0.0334896867964 0.0427727201427 0.00837469426195 -0.0867474116417 -0.0196050995803 0.0606614773481 " + "0.0577649459175 -0.00796528992011 -0.00462316465616 0.133496859505 0.0233127215566 -0.0632115501401" + " -0.0167939822101 0.0507690541289 0.0441614704384 -0.0453769914148 0.0339716416353 -0.0274534168313" + " 0.0892136140009 0.0720007941692 0.0383738960308 -0.065645679348 -0.0230317813338 0.0215128517773 " + "-0.0497991414548 -0.0185619503524 -0.00282655364621 0.0283879975848 -0.0389481254973 " + "-0.0275192782755 0.0505514026376 0.0402641537817 -0.0335474870742 -0.00767920427194 " + "-0.0159605948213 -0.0863086783755 0.0770839598496 -0.109090906581 0.00196075007788 0.0114959092652 " + "-0.0207082787704 0.00355531738666 -0.0648733510065 -0.0437623569472 0.0809910528144 " + "-0.0348729486384 -0.042169333266 0.00687686094931 0.00771470770669 0.00937685160144 " + "-0.0327782459879 0.112907268546 0.0707099784689 -0.0238847213388 0.0190627717508 -0.0365002751551 " + "0.0650844849591 -0.0474641474415 -0.0371004032624 -0.0441911423911 -0.0650795110479 0.0596349649954" + " 0.0679506583803 -0.104315780363 -0.0358552103333 0.0688571108087 -0.125921249797 -0.0793501766777 " + "-0.0764694245514 -0.0625042256718 -0.0398351968236 -0.00139784054712 -0.0992506236718 " + "-0.0349307489162 -0.0105265111336 0.0368045413061 -0.036131176645 0.00701733106073 0.065765396244 " + "0.0260879924637 0.00788244857235 -0.00236672413616 0.0336365029324 0.0539302319356 -0.0723362759005" + " 0.0242152291589 0.0638574725328 -0.0468681356745 0.0453092433146 0.0645030518971 0.0179481011109 " + "0.0822961041915 0.101609458156 -0.0258916087304 0.0642037596573 0.0292023468999 -0.083472176907 " + "0.0748539325129 0.0146876165954 -0.030909255995 0.065078653477 0.0733888584082 -0.0368355853722 " + "0.049940297623 -0.0325051954172 0.104120597229 -0.00500769944225 -0.0448897196363 -0.036414175038 " + "0.0349345222282 0.15395078275 -0.0840393742922 0.04540717791 -0.0183967821994 -0.0147565652948 " + "0.0229129220087 -0.0174109187067 0.0180155061827 -0.113339312759 -0.11662055049 0.0344210087805 " + "-0.0368690306369 -0.0361822878699 -0.0173210452777 -0.0278091372356 -0.0549642909121 " + "0.0831401254593 0.0164374042349 -0.0477666984507 -0.0735370466576 -0.0335068382141 0.0567238548597 " + "-0.0508843116563 -0.0149018378032 -0.0675674957076 0.0611645284309 0.0208751620651 " + "-0.00623368278345 -0.0568559207764 0.0579577278531 0.0681614493045 -0.0232475461691 -0.139026819618" + " 0.0638504404515 -0.0417685046331 -0.00878392708961 0.048339727318 -0.0840908285455 0.0444873473758" + " 0.0425456353716 -0.0316160659207 -0.0688315551962 0.0310598454429 0.0188993187396 -0.0333886649458" + " -0.0515029632947 0.0329230039537 0.0189056647642 0.0479807481443 0.0869861593769 -0.0252765588897 " + "0.0200299401981 -0.0247308007767 0.0251559844229 -0.0181552902374 0.0185816744828 -0.0372676295855 " + "0.0386944560286 -0.0658888864518 -0.039044516465 -0.0329171724717 -0.0518072294457 " + "0.0179050510523"); DenseVector vec_king = VectorUtil.parseDense( "0.0330381329194 0.0665698704227 0.0262800220171 -0.0573233783314 0.0135350782339 0.0539792823665 " + "-0.0321528819666 -0.0233145564785 0.00133663715962 0.02802799633 0.0941987111865 0.00248237383102 " + "0.0470419519499 0.103658297617 0.0284782144395 0.0231670424649 -0.00882697825621 -0.0873673438906 " + "-0.0683468802995 -0.0462785168681 -0.0534324459156 -0.00981769171647 0.0455728191718 " + "-0.0544375102867 -0.0192650965594 0.142169300564 0.0191646401836 0.0453715726781 0.0132457238226 " + "0.0238588898636 -0.0716704511245 -0.049105145688 0.119388898336 -0.0108634726252 -0.0840693043372 " + "-0.0117917763105 0.00234220214387 0.0185465497917 0.0442281722014 0.0254097894593 0.0349079231033 " + "0.0927918213123 0.0419685712304 0.00143726040646 -0.0181866089236 -0.0293169083676 0.00289120791854" + " 0.0412899066451 -0.00506653900628 -0.0222759510252 -0.00194271283549 -0.0767471692652 " + "-0.0921146585665 0.00433781310415 0.0158220460582 -0.0100643271374 0.0112636294179 " + "-0.00254645231657 -0.115208110896 -0.0190087826172 0.0234410447391 0.0215248644015 " + "0.000333575241208 0.0233684558297 -0.0776838165032 -0.0638970967167 -0.0249542314761 " + "-0.0100251124392 -0.0396964549301 0.0194451504289 -0.0408672222599 -0.0939577493815 " + "-0.0254124593962 -0.0222377375533 0.040674319319 0.00156474989335 -0.00562972882073 " + "-0.0923975050066 -0.0037240613597 -0.0700389528087 0.0437459148492 -0.0383488042808 " + "-0.0227029740578 0.0856896222922 -0.0528413886348 0.0869177932654 -0.108374240534 -0.0430757606878 " + "0.0234325343152 -0.0852679391334 0.0398711689258 0.0553487931242 0.0518913917122 0.011713013172 " + "-0.0292461550398 0.0413067606217 -0.0264916145162 -0.0532630717936 0.0333600271866 0.0153077494632 " + "-0.00971740221173 0.0125703966584 -0.0324509136728 -0.0187801692703 -0.021092334624 " + "-0.0305355676905 0.00352782099769 -0.0139532571006 -0.0183539805929 -0.0374970944103 " + "0.0176983442134 -0.0147283731563 0.0271492533485 0.0611023402686 0.028643249914 -0.0351704112746 " + "0.120878889996 0.0430932821487 -0.0071095413465 0.0341006009337 -0.0237223893397 0.0256866285415 " + "-0.0343402277703 0.0487924293288 0.0210194119725 0.00124519182086 0.0485306086417 0.0750082059892 " + "-0.0373182086381 0.0615213534905 0.00246602046752 0.0265133077535 -0.0168888527201 0.0442101501274 " + "0.0203390786766 -0.0685734911937 0.0953204184258 0.0034452198249 -0.064757317011 0.0399227320822 " + "0.133338818015 -0.0327686361637 0.0116489346865 -0.0183264468686 0.0438325209274 -0.023651802883 " + "0.0995132205822 0.0206986858026 0.0342162425756 -0.0435264794104 -0.00740073133945 -0.0057937630689" + " -0.00267561051402 0.00654418220833 0.0277641731903 0.0659632941337 0.057346072795 -0.0114360072188" + " -0.0240195866907 -0.0124515844665 -0.0359702242466 -0.055722918032 -0.0463884848941 " + "0.0683348655834 -0.155736918654 -0.0843628305249 0.0482928173867 -0.00038997765818 -0.0406199193547" + " -0.0918284747052 -0.0667976494144 0.0372963485297 0.0383634889337 -0.0168709975171 " + "-0.0798070838215 0.0714233150903 0.015313089337 0.030887665619 0.0320717826333 0.0237357390242 " + "-0.00285900180471 -0.0253363661946 -0.0554786188059 0.0294742677735 -0.046435542532 " + "-0.00314868995815 -0.186628756049 -0.0952353141872 0.182545421306 0.0334120909561 0.014159509726 " + "-0.0694415544277 0.00423101562823 0.0165244063335 -0.0298050062068 0.0334703289547 0.0128066860739 " + "-0.0135522659527 -0.0458431502827 -0.139522391871 -0.0334446308121 -0.0507084427953 0.0428137731297" + " -0.0329196544695 -0.0102113405379 0.00478185698452 0.0596383804931 -0.0960895271235 0.125720319948" + " 0.169201911044 -0.0187761643649 0.042368894894 0.0135659493793 0.0458503257381 -0.0126451548916 " + "-0.0886390682094 0.00128323842166 0.0174855836168 0.0806846587057 -0.090234355506 0.0810689627479 " + "0.0667399120289 0.0380170646212 0.00552376570008 -0.016451650553 0.0818218849532 0.0307398178632 " + "-0.0350374150429 0.000343253762464 0.0473021039265 -0.128344033563 0.0345044288896 0.00158277196741" + " 0.0237108752369 0.0361431026608 0.00922279640136 0.0827633714519 0.0621466193378 -0.0156743651735 " + "-0.00911149340692 -0.0758977955895 -0.00623830756244 -0.0275959671657 0.00318173042727 " + "0.0828069247976 -0.103742066887 -0.0408753989416 0.0521663952127 -0.00092496626414 -0.0101921503664" + " -0.0297047167021 -0.0108247585402 0.00731662832716 0.0719734889624 0.0539732750085 -0.107010570263" + " -0.0747198528042 0.00864725812876 -0.0168704969039 -0.0366178508155 -0.0121999429138 " + "0.0788080268084 0.0223829153722 0.0517285255614 0.0268236879179 0.130153750167 -0.0144270040289 " + "0.0194927086799 -0.119658561963 -0.122853475203 0.0846531861626 -0.155554862331 -0.0860997913482 " + "-0.0621898389414 -0.17728314255 0.0893194015052 0.0185023289618 -0.135035896656 0.00151218551067 " + "0.0556621769676 0.133543235059 -0.0642887430854 0.0156474989335 -0.00345423086193 0.0526394746568 " + "-0.0207610955776 0.0352259793363 0.0648282372099 0.0572200851477 -0.0698799246922 -0.00263205716837" + " 0.13508078497 -0.04197641417 -0.00110151584153 -0.0527783113755 0.0265091359771 -0.0877034221976 " + "0.00353282712937 -0.013237880883 -0.0142381059934"); DenseVector vec_queen = VectorUtil.parseDense( "0.0199859507483 0.1526205478 0.000618664683489 0.0165901690411 0.074417065154 0.0655692910471 " + "-0.053041765773 -0.0687128661553 0.0255264366915 -0.0386942759625 0.0730841311552 -0.0922892844064 " + "-0.0229324696981 0.0638844371888 -0.0150418480785 0.0189000085504 -0.0429470228238 -0.0769546175135" + " -0.0130125014973 0.00219606209615 -0.0236931981237 0.0251165219562 0.0139353627365 " + "-0.0302018982472 -0.00408697949146 0.135300465049 0.0668865806516 -0.00476063659126 " + "-0.0252876621483 -0.0492251656617 -0.0638152225958 -0.0311961864179 0.0566992355301 0.021559081508 " + "-0.0473516309267 -0.0499675632819 0.11703903223 0.0025697892932 0.00441677596514 0.0769546175135 " + "-0.00425179871621 0.0394772857982 0.0202733967404 -0.00569297928158 0.0428439910553 " + "0.00924504690749 -0.0506530721955 0.067472692353 -0.0232486760918 0.00612564949956 0.0445348498327 " + "-0.10462986707 -0.0260359066633 -0.0325488734398 0.00375765709034 0.069927756088 0.00613465687809 " + "-0.016161607452 -0.105185638128 0.0101784957671 0.0802712291041 -0.0216829724689 -0.0205973463193 " + "0.00489606331763 -0.0986225776848 -0.0480635298792 -0.0312830997196 -0.0945687832722 " + "0.0230062669924 0.0383120154595 -0.0546972271329 -0.0758915888224 -0.00866667839113 " + "-0.0110841323702 0.00186942610617 0.0170563403863 0.0174003590366 -0.0958972925938 0.0187356633982 " + "-0.0299860372108 -0.0212983416031 0.00892915656208 0.00700900469258 0.0946809804434 " + "-0.0435268135578 0.00714854004776 -0.0369166619077 0.0342996233832 0.0416847256356 0.0207027484505 " + "0.0526122560387 0.0445416448726 0.0493632787992 -0.0214533633283 0.0302292364312 -0.0469761654636 " + "0.0588218163804 -0.0169416148281 0.0379850634211 0.0411926383242 -0.0581699666181 -0.0157731840057 " + "-0.0222235732051 0.00352472944195 -0.0241168609631 0.0057759419786 0.0748854488378 0.043092563098 " + "-0.0111934851061 -0.0778250147234 -0.00970789974477 -0.009734605832 -0.000632570811751 " + "0.0428188652099 0.0544333267443 -0.0271946980105 0.0852948179388 0.0416643405158 -0.0533714042225 " + "0.128496575749 0.0231078765432 0.0491968793326 -0.0049483693228 0.0792908470616 -0.0303152015877 " + "0.0316332813131 -0.0184588050264 0.101064683437 0.0119540555075 0.0208304319919 -0.0388508779296 " + "0.0393660367721 -0.0616641973923 -0.00849601227154 -0.0467373909204 -0.00932785158033 " + "0.0759282504332 0.0299090794328 0.00196234432684 0.0687089155507 0.0694321922445 0.0461394274051 " + "-0.0353237781249 -0.0585525431695 -0.0481272136257 0.0317462686052 0.165568101357 0.0869850446195 " + "0.0442423470666 0.0781791469215 0.00161247878169 -0.0178734834459 0.0611762187097 0.0285407480166 " + "-0.0262222171772 0.0767811069586 -0.00349628508868 -0.00613118034602 0.0282738451685 " + "-0.0766406234583 0.0388758457508 0.0050253271008 0.00336654723296 0.0447516590142 -0.118273201113 " + "-0.0591146351948 0.0141709767961 -0.0156278017557 -0.0962939332976 -0.0576453263245 " + "0.00461794075239 0.0248338166896 0.042656416348 0.00735823814099 -0.116470777261 0.126713114799 " + "0.00540853574894 -0.00261466816169 0.0717761649776 0.0596139916188 -0.054438225494 -0.0806844623473" + " 0.072645297994 -0.0136325883984 -0.0536226626763 0.0188807295998 -0.0959965317818 -0.04431535424 " + "0.149231245084 0.0243888205852 -0.0337752991381 -0.00974803788771 0.0531040273018 -0.0498515735302 " + "-0.0612596554793 0.0206042993834 -0.0412204505807 0.0134407470381 0.096124057299 -0.14383155869 " + "-0.00114614941235 -0.0758740481379 0.0221914942955 -0.0283618646394 0.00667573168684 " + "-0.00922971856157 -0.011458491664 -0.015071398601 0.0886578886397 0.116466036536 0.0829227169008 " + "0.0263431056786 0.0541117475282 0.0670427085462 0.0810667228504 -0.0576652373718 -0.0213899956302 " + "-0.0312309517386 0.0512192728497 -0.0627393939457 0.0873235324233 0.106129990656 0.0314580324922 " + "0.00744894402306 -0.0164963026754 0.0926408882176 0.0149500360271 0.0327484579852 0.0299005461268 " + "-0.0543593714258 -0.0530444521841 0.057176310544 -0.0461425878888 -0.0161775678946 -0.0101426242772" + " -0.0337007117228 0.0797318925614 0.115810078144 -0.0557607298966 -0.0341913768166 " + "-0.00794830044703 -0.0359602995413 -0.0353556990102 0.00737388253529 0.0849293079994 " + "-0.109218099276 -0.066264439436 0.0851826207676 0.0488566532627 -0.010613694372 -0.0493604343639 " + "0.0270755477751 0.0503324411246 0.0729935832973 0.0779000762112 -0.0777292520674 -0.0369526914218 " + "0.0396248803868 0.0754470667905 -0.0569478075728 -0.000792965359322 0.0110006956006 " + "-0.0244976992485 -0.0154890565215 -0.0568800151975 0.0403171843404 0.0681994455789 0.0543968231576 " + "-0.072004825973 -0.14427829306 0.0705822922615 -0.12334672559 -0.117944668833 0.00840135578485 " + "-0.165265801092 0.107454075301 0.0254774491943 -0.116934104171 -0.0859812749976 0.107266184545 " + "-0.00975767736298 -0.11928281763 -0.00672203277299 0.0101364613339 0.0463544983206 -0.086025205721 " + "0.0207670642938 0.0134198878457 0.136448668776 -0.0568893386245 -0.0168189880607 0.0293896539373 " + "-0.0191718101482 0.0392143335547 0.00512172185353 0.0049556384353 -0.0369394173903 " + "-0.00211562778609 0.0267236279156 0.0352932794572"); System.out.println("'man' vector normL2 : " + vec_man.normL2()); System.out.println("'woman' vector normL2 : " + vec_woman.normL2()); System.out.println("'king' vector normL2 : " + vec_king.normL2()); System.out.println("'queen' vector normL2 : " + vec_queen.normL2()); System.out.println("'man - woman' normL2 : " + vec_man.minus(vec_woman).normL2()); System.out.println("'king - queen' normL2 : " + vec_king.minus(vec_queen).normL2()); System.out.println("(man - woman) - (king - queen) normL2 : " + vec_man.minus(vec_woman).minus(vec_king.minus(vec_queen)).normL2()); BatchOperator.setParallelism(1); new VectorNearestNeighbor() .setIdCol("word") .setSelectedCol("vec") .setMetric(Metric.EUCLIDEAN) .setOutputCol("similar_words") .setTopN(5) .fit( getWikiDependency() ) .transform( new MemSourceBatchOp( new Row[] { Row.of("king", vec_king), Row.of("king-man+woman", vec_king.minus(vec_man).plus(vec_woman)), Row.of("queen", vec_queen), Row.of("queen-woman+man", vec_queen.minus(vec_woman).plus(vec_man)), }, new String[] {"word", "vec"} ) ) .select("word, similar_words") .print(); } static void c_1_3_c() throws Exception { DenseVector vec_man = VectorUtil.parseDense( "0.37293 0.38503 0.71086 -0.65911 -0.0010128 0.92715 0.27615 -0.056203 -0.24294 0.24632 -0.18449 0.31398 " + "0.48983 0.09256 0.32958 0.15056 0.57317 -0.18529 -0.52277 0.46191 0.92038 0.031001 -0.16246 " + "-0.40567 0.78621 0.57722 -0.53501 -0.68228 0.16987 0.3631 -0.071773 0.47233 0.027806 -0.14951 " + "0.17543 -0.37573 -0.78517 0.58171 0.86859 0.031445 -0.45897 -0.040917 0.95897 -0.16975 0.13045 " + "0.27434 -0.069485 0.022402 0.24977 -0.21536 -0.32406 -0.39867 0.68613 1.7923 -0.37848 -2.2477 " + "-0.77025 0.46582 1.2411 0.57756 0.41151 0.84328 -0.54259 -0.16715 0.73927 -0.093477 0.90278 0.50889" + " -0.50031 0.26451 0.15443 -0.29432 0.10906 -0.26667 0.35438 0.049079 0.18018 -0.5859 -0.55542 " + "-0.28987 0.74278 0.3453 -0.028757 -0.22646 -1.3113 -0.5719 -0.52306 -0.1267 -0.098678 -0.53463 " + "0.28607 -0.37501 0.45742 0.045975 -0.24675 0.045656 -0.38302 -0.93711 0.039138 -0.53911"); DenseVector vec_woman = VectorUtil.parseDense( "0.59368 0.44825 0.5932 0.074134 0.11141 1.2793 0.16656 0.2407 0.39045 0.32766 -0.75034 0.35007 0.76057 " + "0.38067 0.17517 0.031791 0.46849 -0.21653 -0.46282 0.39967 0.16623 -0.011477 0.044059 0.30325 " + "0.6153 0.47047 -0.44036 -1.5963 0.18433 0.23193 0.20452 0.51617 0.65734 -0.3452 0.23446 -0.62004 " + "-0.68741 0.28575 1.0605 0.46916 -0.85149 0.10154 0.21426 -0.20587 0.23636 0.21321 -0.21287 0.12107 " + "0.18766 -0.23282 -0.25499 -0.39631 0.84379 1.6801 -0.40941 -1.9976 -0.69868 0.21732 1.2197 0.55126 " + "0.44095 0.72588 -0.092053 -0.022406 0.72039 0.1076 0.84116 0.30312 -0.42544 0.056362 0.13109 " + "-0.071181 -0.10579 0.56677 0.54547 0.84113 0.14861 -0.62628 -0.68391 -1.0831 -0.088385 0.32167 " + "0.47794 0.091868 -1.2559 -1.2268 0.085401 0.36833 0.081566 -0.76611 0.87751 -0.22008 0.82401 " + "-0.092207 -0.45941 0.46571 -0.56018 -0.54648 0.15162 -0.30754"); DenseVector vec_king = VectorUtil.parseDense( "-0.32307 -0.87616 0.21977 0.25268 0.22976 0.7388 -0.37954 -0.35307 -0.84369 -1.1113 -0.30266 0.33178 " + "-0.25113 0.30448 -0.077491 -0.89815 0.092496 -1.1407 -0.58324 0.66869 -0.23122 -0.95855 0.28262 " + "-0.078848 0.75315 0.26584 0.3422 -0.33949 0.95608 0.065641 0.45747 0.39835 0.57965 0.39267 -0.21851" + " 0.58795 -0.55999 0.63368 -0.043983 -0.68731 -0.37841 0.38026 0.61641 -0.88269 -0.12346 -0.37928 " + "-0.38318 0.23868 0.6685 -0.43321 -0.11065 0.081723 1.1569 0.78958 -0.21223 -2.3211 -0.67806 0.44561" + " 0.65707 0.1045 0.46217 0.19912 0.25802 0.057194 0.53443 -0.43133 -0.34311 0.59789 -0.58417 " + "0.068995 0.23944 -0.85181 0.30379 -0.34177 -0.25746 -0.031101 -0.16285 0.45169 -0.91627 0.64521 " + "0.73281 -0.22752 0.30226 0.044801 -0.83741 0.55006 -0.52506 -1.7357 0.4751 -0.70487 0.056939 " + "-0.7132 0.089623 0.41394 -1.3363 -0.61915 -0.33089 -0.52881 0.16483 -0.98878"); DenseVector vec_queen = VectorUtil.parseDense( "-0.50045 -0.70826 0.55388 0.673 0.22486 0.60281 -0.26194 0.73872 -0.65383 -0.21606 -0.33806 0.24498 " + "-0.51497 0.8568 -0.37199 -0.58824 0.30637 -0.30668 -0.2187 0.78369 -0.61944 -0.54925 0.43067 " + "-0.027348 0.97574 0.46169 0.11486 -0.99842 1.0661 -0.20819 0.53158 0.40922 1.0406 0.24943 0.18709 " + "0.41528 -0.95408 0.36822 -0.37948 -0.6802 -0.14578 -0.20113 0.17113 -0.55705 0.7191 0.070014 " + "-0.23637 0.49534 1.1576 -0.05078 0.25731 -0.091052 1.2663 1.1047 -0.51584 -2.0033 -0.64821 0.16417 " + "0.32935 0.048484 0.18997 0.66116 0.080882 0.3364 0.22758 0.1462 -0.51005 0.63777 0.47299 -0.3282 " + "0.083899 -0.78547 0.099148 0.039176 0.27893 0.11747 0.57862 0.043639 -0.15965 -0.35304 -0.048965 " + "-0.32461 1.4981 0.58138 -1.132 -0.60673 -0.37505 -1.1813 0.80117 -0.50014 -0.16574 -0.70584 0.43012" + " 0.51051 -0.8033 -0.66572 -0.63717 -0.36032 0.13347 -0.56075"); System.out.println("'man' vector normL2 : " + vec_man.normL2()); System.out.println("'woman' vector normL2 : " + vec_woman.normL2()); System.out.println("'king' vector normL2 : " + vec_king.normL2()); System.out.println("'queen' vector normL2 : " + vec_queen.normL2()); System.out.println("'man - woman' normL2 : " + vec_man.minus(vec_woman).normL2()); System.out.println("'king - queen' normL2 : " + vec_king.minus(vec_queen).normL2()); System.out.println("(man - woman) - (king - queen) normL2 : " + vec_man.minus(vec_woman).minus(vec_king.minus(vec_queen)).normL2()); BatchOperator.setParallelism(1); new VectorNearestNeighbor() .setIdCol("word") .setSelectedCol("vec") .setMetric(Metric.EUCLIDEAN) .setOutputCol("similar_words") .setTopN(5) .fit( getGlove6B100d() ) .transform( new MemSourceBatchOp( new Row[] { Row.of("king", vec_king), Row.of("king-man+woman", vec_king.minus(vec_man).plus(vec_woman)), Row.of("queen", vec_queen), Row.of("queen-woman+man", vec_queen.minus(vec_woman).plus(vec_man)), }, new String[] {"word", "vec"} ) ) .select("word, similar_words") .print(); } static void c_2() throws Exception { BatchOperator.setParallelism(1); TextSourceBatchOp source = new TextSourceBatchOp().setFilePath(DATA_DIR + ORIGIN_FILE); source.lazyPrint(8); final String[] CHARACTER_DICT = new String[] { "曹操", "孔明", "玄德", "刘玄德", "刘备", "关羽", "张飞", "赵云", "曹孟德", "诸葛亮", "张郃", "孙权", "张辽", "鲁肃" }; source .link( new SegmentBatchOp() .setSelectedCol("text") .setUserDefinedDict(CHARACTER_DICT) ) .link( new StopWordsRemoverBatchOp() .setSelectedCol("text") ) .link( new WordCountBatchOp() .setSelectedCol("text") ) .orderBy("cnt", 100, false) .print(); if (!new File(DATA_DIR + W2V_MODEL_FILE).exists()) { source .link( new SegmentBatchOp() .setSelectedCol("text") .setUserDefinedDict(CHARACTER_DICT) ) .link( new StopWordsRemoverBatchOp() .setSelectedCol("text") .setStopWords( "亦", "曰", "遂", "吾", "已", "去", "二人", "今", "使", "中", "知", "不", "见", "都", "令", "却", "欲", "请", "人", "谓", "不可", "闻", "前", "后", "皆", "便", "问", "日", "时", "耳", "不敢", "问", "回", "才", "之事", "之人", "之时", "料", "今日", "令人", "受", "说", "出", "已毕", "不得", "使人", "众", "何不", "不知", "再", "处", "无", "即日", "诸", "此时", "只", "下", "还", "上", "杀", "将军", "却说", "兵", "汝", "走", "言", "寨", "不能", "斩", "死", "商议", "听", "军士", "军", "左右", "军马", "引兵", "次日", "二", "看", "耶", "退", "更", "毕", "正", "一人", "原来", "大笑", "车胄", "口", "引", "大喜", "其事", "助", "事", "未", "大", "至此", "讫", "心中", "敢" ) ) .link( new Word2VecTrainBatchOp() .setSelectedCol("text") .setMinCount(10) .setNumIter(50) ) .link( new AkSinkBatchOp() .setFilePath(DATA_DIR + W2V_MODEL_FILE) ); BatchOperator.execute(); } AkSourceBatchOp word2vec = new AkSourceBatchOp().setFilePath(DATA_DIR + W2V_MODEL_FILE); new VectorNearestNeighbor() .setIdCol("word") .setSelectedCol("vec") .setTopN(20) .setOutputCol("similar_words") .fit(word2vec) .transform( word2vec.filter("word IN ('曹操', '操', '玄德', '刘备', '孔明', " + "'亮', '卧龙', '周瑜', '吕布', '貂蝉', '云长', '孙权')") ) .select("word, similar_words") .print(); } }