-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdo_r_text.py
97 lines (91 loc) · 3.5 KB
/
do_r_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import importlib
import json
import os
import sys
from collections import defaultdict

import romkan
from pythonlib_ys import main as myModule

# Re-import the helper module so that edits to it are picked up when this file
# is re-run inside a live interpreter session.  `importlib.reload` replaces the
# legacy `imp.reload`: the `imp` module was removed in Python 3.12.
importlib.reload(myModule)
def get_hom_in_file(FSr,JsonFP,UpTo=100000,FstPosition=0,AssumeSortedP=False):
    """Pop the next chunk of records from FSr and group its vectors by orthography.

    The chunk is obtained from ``myModule.pop_chunk_from_stream(FSr, Pattern=',',
    Type='cont')``.  NOTE(review): presumably a chunk is the run of consecutive
    lines belonging to one homophone -- confirm against that helper.

    Every non-blank line of the chunk must be a JSON array of five items,
    ``[PronCat, Ind, Len, Toks, Vec]``.  The orthography of a record is the
    concatenation of ``Toks[Ind:Ind+Len]``.

    Parameters
    ----------
    FSr : stream handed to (and returned by) ``pop_chunk_from_stream``.
    JsonFP, UpTo, FstPosition, AssumeSortedP : not used by this function; kept
        so that existing callers keep working.

    Returns
    -------
    ``None`` when the helper yields no chunk or a falsy stream, or when the
    chunk holds no records.  Otherwise the tuple
    ``(FSr, OrthsVecs, PronCat, LineCnt, MultiToks)`` where

    * ``OrthsVecs`` is a ``defaultdict(list)`` mapping orthography -> list of Vec,
    * ``PronCat`` is the first field of the LAST record in the chunk,
    * ``LineCnt`` is the count reported by ``pop_chunk_from_stream``,
    * ``MultiToks`` lists the orthography of every record with ``Len >= 2``
      (one entry per record, so duplicates are possible).

    Raises
    ------
    json.JSONDecodeError / ValueError if a non-blank line is not a JSON array
    of exactly five items.
    """
    FSr,Chunk,LineCnt,_=myModule.pop_chunk_from_stream(FSr,Pattern=',',Type='cont')
    if not Chunk or not FSr:
        return None

    OrthsVecs=defaultdict(list)
    MultiToks=[]
    PronCat=None
    for Line in Chunk.split('\n'):
        # Blank lines (leading, trailing or interior) carry no record; feeding
        # them to json.loads would raise, so skip them.
        if not Line.strip():
            continue
        PronCat,Ind,Len,Toks,Vec=json.loads(Line)
        Orth=''.join(Toks[Ind:Ind+Len])
        if Len>=2:
            MultiToks.append(Orth)
        OrthsVecs[Orth].append(Vec)

    # A chunk made only of whitespace yields no record: report "nothing read"
    # instead of failing on an unbound PronCat.
    if not OrthsVecs:
        return None
    return FSr,OrthsVecs,PronCat,LineCnt,MultiToks
def output_text_per_hom(OutJsonFP,Max=3000,LCnt=7900):
    """Write one tab-separated vector file per selected homophone group.

    Groups are read one chunk at a time from ``OutJsonFP`` with
    ``get_hom_in_file``.  Output goes to the directory
    ``<dirname(OutJsonFP)>/<basename(OutJsonFP)>_txt``, created if missing.

    A group is written only when ALL of the following hold:

    * its line count exceeds ``LCnt/1000``;
    * the part of the group label before the first ``':'`` has at least two
      characters;
    * it has at least two distinct orthographies, at least two of which pass
      ``myModule.at_least_one_of_chartypes_p(orth, ['han'])`` (kanji
      orthographies, per the original comments).

    For a selected group the file ``homvecs_<romanised label>`` receives the
    output of ``stringify_hom_vecs``.  When every orthography of the group is
    a kanji one, ``<that file>.refclusters`` additionally receives the
    tab-joined output of ``get_cluster_ref``.

    Parameters
    ----------
    OutJsonFP : path of the input file (one JSON record per line).
    Max : maximum number of groups to read from the file.
    LCnt : value used in place of the input's line count when deriving the
        size threshold.  The default keeps the previously hard-coded 7900
        (which stood in for a commented-out ``myModule.get_linecount`` call).

    Progress messages are printed to stdout.  Returns None.
    """
    def is_of_interest(Orths):
        """Return (Selected, KanjiOnly); KanjiOnly is None when not selected."""
        # at least 2 variations in orth
        if len(Orths)<2:
            return False,None
        # at least two of them must be kanji orths
        KanjiOrths=[Orth for Orth in Orths if myModule.at_least_one_of_chartypes_p(Orth,['han'])]
        if len(KanjiOrths)<2:
            return False,None
        return True,len(KanjiOrths)==len(Orths)

    TxtDir=os.path.join(os.path.dirname(OutJsonFP),os.path.basename(OutJsonFP)+'_txt')
    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(TxtDir,exist_ok=True)

    CntThresh=LCnt/1000
    CntSoFar=0
    Cntr=0
    with open(OutJsonFP) as FSr:
        # Both conditions must hold.  The previous `or` made the Max cap dead
        # code because an open stream object is always truthy.
        while FSr and Cntr<Max:
            Ret=get_hom_in_file(FSr,OutJsonFP,FstPosition=CntSoFar)
            if not Ret:
                break
            FSr,OrthsVecs,Hom,Cnt,_=Ret

            Orths=list(OrthsVecs.keys())
            print('For '+Hom+', we found the following orths, '+str(Cnt)+' items')
            print(Orths)
            print(CntThresh)

            IsOfInt,KanjiOnly=is_of_interest(Orths)
            if Cnt>CntThresh and len(Hom.split(':')[0])>=2 and IsOfInt:
                print('writing out...')
                RomHom=romkan.to_roma(Hom)
                OutHomFP=os.path.join(TxtDir,'homvecs_'+RomHom)
                with open(OutHomFP,'wt') as FSw:
                    FSw.write(stringify_hom_vecs(OrthsVecs))
                print('... done, fp: '+OutHomFP)
                if KanjiOnly:
                    # Reference cluster labels are only emitted for all-kanji groups.
                    RefClusterFP=OutHomFP+'.refclusters'
                    with open(RefClusterFP,'wt') as FSw:
                        FSw.write('\t'.join(get_cluster_ref(OrthsVecs)))
            else:
                print('not selected for printing\n')

            CntSoFar+=Cnt
            Cntr+=1
def get_cluster_ref(OrthsVecs):
    """Return reference cluster labels, one per vector, in mapping order.

    The i-th orthography of ``OrthsVecs`` (1-based, in iteration order) gets
    the label ``str(i)``, repeated once for each vector stored under it.

    Parameters
    ----------
    OrthsVecs : mapping of orthography -> list of vectors.

    Returns
    -------
    list of str, of length ``sum(len(vecs) for vecs in OrthsVecs.values())``.

    NOTE(review): ``stringify_hom_vecs`` caps the vectors it writes per
    orthography (``UpToPerOrth``, default 1000) while this function does not,
    so the two outputs can differ in length for very large groups.
    """
    Refs=[]
    for Label,Vecs in enumerate(OrthsVecs.values(),start=1):
        # Repeat the whole label string.  Multiplying the string and then
        # splitting it into characters breaks labels of two or more digits
        # ('10' would become '1','0','1','0',...).
        Refs.extend([str(Label)]*len(Vecs))
    return Refs
def stringify_hom_vecs(OrthsVecs,UpToPerOrth=1000):
    """Render the vectors of a homophone group as tab-separated text.

    One line is produced per vector:
    ``<orth><orth index>_<vector index>TAB<v0>TAB<v1>...`` followed by a
    newline.  Orth and vector indices are 0-based and follow the iteration
    order of ``OrthsVecs`` and of each vector list respectively.

    Parameters
    ----------
    OrthsVecs : mapping of orthography -> iterable of vectors (each vector an
        iterable of values rendered with ``str``).
    UpToPerOrth : at most this many vectors are written per orthography.

    Returns
    -------
    str -- the concatenated lines ('' when there is nothing to write).
    """
    # Collect lines and join once; repeated `Str += ...` is quadratic.
    Lines=[]
    for OrthCntr,(Orth,Vecs) in enumerate(OrthsVecs.items()):
        for VecCntr,Vec in enumerate(Vecs):
            if VecCntr>=UpToPerOrth:
                break
            Label=Orth+str(OrthCntr)+'_'+str(VecCntr)
            Lines.append(Label+'\t'+'\t'.join(str(Num) for Num in Vec)+'\n')
    return ''.join(Lines)
def main(FP):
    """Run the per-homophone text export on the input file at path FP."""
    output_text_per_hom(FP)
if __name__=='__main__':
    # Exit with a usage message rather than an IndexError traceback when the
    # input path is missing from the command line.
    if len(sys.argv)<2:
        sys.exit('usage: '+os.path.basename(sys.argv[0])+' HOMVECS_JSON_FILE')
    FP=sys.argv[1]
    main(FP)