-- Phrase-statistics pipeline (Apache Pig Latin).
--
-- Reads two phrase-count tables, joins per-phrase and per-word counts together
-- with corpus totals, and scores every phrase with the SmoothedPKL UDF.
-- Naming: "fg"/"f" and "bg"/"b" prefixes presumably stand for the foreground
-- and background corpora -- TODO confirm against the data owners.
--
-- Outputs written:
--   phrases/data/phraseStats1   raw fg/bg phrase join
--   phrases/data/phraseStats    one row per phrase with all counts and totals
--   phrases/data/phraseResult   phraseStats plus infoness and phraseness
--
-- Each statement sits on its own line: a "--" comment runs to the end of the
-- physical line, so statements must never share a line with a preceding comment.

SET default_parallel 10;

-- load data
-- Each input row is (xy, c): a phrase string and its integer count.
-- STRSPLIT on a single space turns the phrase into the tuple xy:(x,y).
fgPhrases1 = LOAD 'phrases/data/dkos-phraseFreq-5/' AS (xy, c:int);
fgPhrases = FOREACH fgPhrases1 GENERATE
    STRSPLIT(xy, ' ') AS xy:(x,y),
    c AS c;
-- fgPhrases: {tuple_0: (x: bytearray,y: bytearray),c: int}
-- STORE fgPhrases INTO 'phrases/data/fgPhrases';

bgPhrases1 = LOAD 'phrases/data/brown-phraseFreq-5/' AS (xy, c:int);
bgPhrases = FOREACH bgPhrases1 GENERATE
    STRSPLIT(xy, ' ') AS xy:(x,y),
    c AS c;

-- compute word frequencies
-- Counts are grouped on xy.x only, i.e. a word's frequency is the summed count
-- of the phrases in which it is the first element. The same tables are reused
-- below to look up the frequency of the second element (xy.y).
fgWordFreq1 = GROUP fgPhrases BY xy.x;
fgWordFreq = FOREACH fgWordFreq1 GENERATE
    group AS w,
    SUM(fgPhrases.c) AS c;
-- STORE fgWordFreq INTO 'phrases/data/fgWordFreq';

bgWordFreq1 = GROUP bgPhrases BY xy.x;
bgWordFreq = FOREACH bgWordFreq1 GENERATE
    group AS w,
    SUM(bgPhrases.c) AS c;
-- STORE bgWordFreq INTO 'phrases/data/bgWordFreq';

-- join in phrase stats, and then clean up
-- JOIN is an inner join: only phrases present in both inputs survive.
phraseStats1 = JOIN fgPhrases BY xy, bgPhrases BY xy;
-- NOTE(review): unlike the other intermediate STOREs this one is active;
-- it looks like a debugging checkpoint -- confirm whether it is still needed.
STORE phraseStats1 INTO 'phrases/data/phraseStats1';
phraseStats2 = FOREACH phraseStats1 GENERATE
    fgPhrases::xy AS xy,
    fgPhrases::c AS fC,
    bgPhrases::c AS bC;
-- STORE phraseStats2 INTO 'phrases/data/phraseStats2';

-- join in word freqs for x and clean up
-- Three-way inner join on the same key: the word must have an entry in both
-- fgWordFreq and bgWordFreq for the phrase row to be kept.
phraseStats3 = JOIN fgWordFreq BY w, bgWordFreq BY w, phraseStats2 BY xy.x;
phraseStats4 = FOREACH phraseStats3 GENERATE
    xy,
    fC,
    bC,
    fgWordFreq::c AS fxC,
    bgWordFreq::c AS bxC;
-- STORE phraseStats4 INTO 'phrases/data/phraseStats4';

-- join in word freqs for y and clean up
phraseStats5 = JOIN fgWordFreq BY w, bgWordFreq BY w, phraseStats4 BY xy.y;
phraseStats6 = FOREACH phraseStats5 GENERATE
    xy,
    fC,
    bC,
    fxC,
    bxC,
    fgWordFreq::c AS fyC,
    bgWordFreq::c AS byC;
-- STORE phraseStats6 INTO 'phrases/data/phraseStats6';

-- compute totals
-- Totals are the sum of all phrase counts in each raw input relation.
fgPhraseCount1 = GROUP fgPhrases1 ALL;
fgPhraseCount = FOREACH fgPhraseCount1 GENERATE SUM(fgPhrases1.c);
-- STORE fgPhraseCount INTO 'phrases/data/fgPhraseCount';

bgPhraseCount1 = GROUP bgPhrases1 ALL;
bgPhraseCount = FOREACH bgPhraseCount1 GENERATE SUM(bgPhrases1.c);

-- join in totals - ok to use cross-product here
-- since all but one table are just singletons
counts1 = CROSS fgPhraseCount, bgPhraseCount;
counts = FOREACH counts1 GENERATE
    $0 AS fTot,
    $1 AS bTot;
phraseStats = CROSS phraseStats6, counts;
STORE phraseStats INTO 'phrases/data/phraseStats';
-- phraseStats: {phraseStats6::phraseStats4::phraseStats2::xy: (x: bytearray,y: bytearray),phraseStats6::phraseStats4::phraseStats2::fC: int,phraseStats6::phraseStats4::phraseStats2::bC: int,phraseStats6::phraseStats4::fxC: long,phraseStats6::phraseStats4::bxC: long,phraseStats6::fyC: long,phraseStats6::byC: long,counts::fTot: long,counts::bTot: long}
-- phraseStats = LOAD 'phrases/data/phraseStats' AS (xy:(x,y),fC,bC,fxC,bxC,fyC,byC,fTot,bTot);

-- final compute phraseness, etc
-- SmoothedPKL is a UDF supplied by pkl.jar; its argument semantics are not
-- visible in this script. The last two arguments look like smoothing
-- parameters -- TODO confirm against the UDF source.
REGISTER ./pkl.jar;
phraseResult = FOREACH phraseStats GENERATE
    *,
    com.wcohen.SmoothedPKL(fC, fTot, bC, bTot, 1.0/bTot, 1.0) AS infoness,
    com.wcohen.SmoothedPKL(fC, fTot, fxC*fyC, fTot*fTot, 1.0/fxC, 1.0) AS phraseness;
STORE phraseResult INTO 'phrases/data/phraseResult';