# Source: co-clustering-visual-categorization/bagoftopics/universalTopicDictionary.py at master - ashish-code/co-clustering-visual-categorization - GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
'''
Created on 7 Dec 2011

@author: ag00087
'''

# import libraries
import argparse
import os
import shutil
import subprocess
import sys
import tempfile
import time
from optparse import OptionParser

import numpy as np

# parse options
# Command-line interface of the script.  OptionParser is the optparse class
# (imported at the top of the file); ccUniversalTopicDictionary() relies on
# its parse_args() returning an (options, args) pair.
parser = OptionParser()
parser.add_option(
    '-d', '--dataset', action='store', type='string', dest='dataset',
    metavar='dataset', help='visual dataset')
parser.add_option(
    '-w', '--nCodeword', action='store', type='int', dest='nCodeword',
    default=1024, metavar='nCodeword',
    help='number of code words: 32,64,128,256,512,1024,2048,4096,8192,16384')
parser.add_option(
    '-t', '--nTopic', action='store', type='int', dest='nTopic',
    metavar='nTopic', default=128,
    help='number of topic words: 8,16,32,64,128,256,512,1024')
parser.add_option(
    '-s', '--scheme', action='store', type='string', dest='ccType',
    metavar='ccType', default='i', help='ccType')
# -q switches options.verbose (default True) off; verbose only gates the
# progress messages printed by ccUniversalTopicDictionary().
parser.add_option(
    '-q', '--quiet', action='store_false', dest='verbose', default=True,
    help='suppress progress output')
parser.add_option(
    '-r', '--rowClusters', action='store', type='int', metavar='nRowCluster',
    dest='nRowCluster', default=10, help='number of row clusters')


# --- file-system layout -----------------------------------------------------
# Every dataset lives in its own folder directly below rootDir; the
# sub-folder names below are appended to rootDir + <dataset>.
rootDir = '/vol/vssp/diplecs/ash/Data/'
imgWrdDir = '/ImgWrdMat/'                # per-category image-word matrices (input)
utdDir = '/UniversalTopicDictionary/'    # topic dictionaries (output)
ucbDir = '/UniversalCB/'                 # not referenced elsewhere in this file
tempDir = 'Temp'                         # scratch area under rootDir used by coclust()

# --- file names and extensions ----------------------------------------------
catidfname = 'catidlist.txt'   # comma-separated "<category>,<id>" list per dataset
imgWrdext = '.iwm'
utdext = '.utd'
ucbext = '.ucb'                # not referenced elsewhere in this file

def getCatMap(dataset):
    '''Return a dict mapping category name -> category id for ``dataset``.

    The mapping is read from ``rootDir + dataset + '/' + catidfname``, a
    comma-separated text file whose first column holds the category name
    and whose second column holds its integer id.

    Names are read with the fixed-width byte-string dtype ``'|S32'`` (so they
    are byte strings, limited to 32 bytes); ids are read as integers.
    '''
    catidfpath = rootDir + dataset + '/' + catidfname
    # np.atleast_1d: genfromtxt yields a 0-d array for a one-line file, which
    # zip() cannot iterate; promoting to 1-d keeps single-category datasets
    # working.
    catnames = np.atleast_1d(
        np.genfromtxt(catidfpath, delimiter=',', dtype='|S32', usecols=[0]))
    # The builtin int replaces the np.int alias, which was removed from NumPy
    # (it was only ever an alias of the builtin, so the dtype is unchanged).
    catnum = np.atleast_1d(
        np.genfromtxt(catidfpath, delimiter=',', dtype=int, usecols=[1]))
    return dict(zip(catnames, catnum))

def _loadImageWordMatrix(dataset, nCodeword):
    '''Load every category's image-word matrix of ``dataset`` and stack them row-wise.

    One file per category returned by getCatMap(dataset) is read from
    ``rootDir + dataset + imgWrdDir + <category> + str(nCodeword) + imgWrdext``
    as space-delimited int16 values.

    Raises ValueError when the dataset lists no categories.
    '''
    dataPath = rootDir + dataset + imgWrdDir
    dataext = str(nCodeword) + imgWrdext

    catBlocks = []
    for catName in getCatMap(dataset).keys():
        # getCatMap reads names with a byte-string dtype; on Python 3 such a
        # key is bytes and must be decoded before it can be joined to a path.
        # On Python 2 the key already is a str and is used unchanged.
        if not isinstance(catName, str):
            catName = catName.decode()
        iwmFileName = dataPath + catName + dataext
        # ndmin=2 keeps a category file holding a single row two-dimensional
        # so that it can be stacked with the others.
        catBlocks.append(
            np.loadtxt(iwmFileName, dtype=np.int16, delimiter=' ', ndmin=2))

    if not catBlocks:
        raise ValueError('no categories listed for dataset %r' % (dataset,))
    # One concatenate over all blocks instead of re-copying the growing
    # matrix once per category.
    return np.concatenate(catBlocks, axis=0)


def _writeUniversalTopicDictionary(dataset, nCodeword, nTopic, nRowCluster,
                                   ccType, verbose=False):
    '''Co-cluster the stacked image-word matrix and save the column assignment.

    Shared implementation of ccUniversalTopicDictionary() and
    UniversalTopicDictionary().  The result is written with one integer per
    line to ``rootDir + dataset + utdDir + dataset + str(nCodeword) + utdext``.
    '''
    iwmData = _loadImageWordMatrix(dataset, nCodeword)

    if verbose:
        print('co-clustering...')

    ccData = coclust(iwmData, dataset, nRowCluster, nTopic, ccType)
    # The second line of the co-clustering output is parsed as the
    # whitespace-separated column co-cluster indices.
    ccCol = np.array([int(i) for i in ccData[1].split()])

    resultFileName = rootDir + dataset + utdDir + dataset + str(nCodeword) + utdext
    np.savetxt(resultFileName, ccCol, fmt='%d', delimiter=' ')


def ccUniversalTopicDictionary():
    '''Command-line entry point: build the universal topic dictionary.

    Reads dataset, nCodeword, nTopic, ccType and nRowCluster from
    ``sys.argv`` through the module-level ``parser`` and writes the
    resulting dictionary file.  Exits through ``parser.error`` when no
    dataset was supplied.
    '''
    # acquire program arguments
    options, _ = parser.parse_args(sys.argv[1:])
    if options.dataset is None:
        # Without a dataset every path below would be built from None.
        parser.error('a dataset must be given with -d/--dataset')

    # echo arguments; single-argument print(...) behaves identically on
    # Python 2 and Python 3.
    if options.verbose:
        print(options)

    _writeUniversalTopicDictionary(
        options.dataset, options.nCodeword, options.nTopic,
        options.nRowCluster, options.ccType, verbose=options.verbose)


def UniversalTopicDictionary(nCodeword,nTopic,dataset):
    '''Build the universal topic dictionary of ``dataset`` programmatically.

    Same processing as ccUniversalTopicDictionary() but with explicit
    arguments, a fixed number of row clusters (10), co-clustering type 'i'
    and no console output.

    nCodeword -- code-book size; selects the ``<category><nCodeword>.iwm`` inputs
    nTopic    -- number of column clusters requested from coclust()
    dataset   -- dataset folder name below rootDir
    '''
    _writeUniversalTopicDictionary(
        dataset, nCodeword, nTopic, nRowCluster=10, ccType='i')


def coclust(data,dataset,nRowCluster=1,nColCluster=1,ccType='i'):
    '''Run the external ``cocluster-linux`` program on ``data`` and return its output.

    data        -- 2-d integer array; its last column is dropped before the
                   matrix is handed to the program (presumably a label
                   column -- TODO confirm against the .iwm writer)
    dataset     -- unused; kept so existing callers keep working
    nRowCluster -- value passed to the program's -R option
    nColCluster -- value passed to the program's -C option
    ccType      -- value passed to the program's -A option

    Returns the lines of the program's output file as a list of strings.
    All scratch files live in a private directory below ``rootDir + tempDir``
    that is removed again even when an error occurs.
    '''
    tempRoot = rootDir + tempDir
    if not os.path.isdir(tempRoot):
        os.makedirs(tempRoot)
    # mkdtemp guarantees a directory of our own; the bare timestamp used
    # before could be shared (and deleted) by a run started in the same second.
    tempPath = tempfile.mkdtemp(prefix=str(int(time.time())) + '_', dir=tempRoot) + '/'
    try:
        tempdataPath = tempPath + 'tempdata'
        tempdataDimPath = tempPath + 'tempdata_dim'
        tempCCFilePath = tempPath + 'tempCCFile.txt'

        features = data[:, :-1]
        np.savetxt(tempdataPath, features, fmt='%d', delimiter=' ')
        # Matrix dimensions go to "<input>_dim", one value per line
        # (presumably where cocluster-linux looks for them -- TODO confirm).
        np.savetxt(tempdataDimPath, features.shape, fmt='%d', delimiter=' ')

        cmdPath = '/vol/vssp/diplecs/ash/code/cocluster/'
        # Same arguments as before, passed as a list so that no shell parses
        # (or is injected through) ccType and the paths.
        cmd = ['cocluster-linux',
               '-A', '%s' % ccType,
               '-R', '%d' % nRowCluster,
               '-C', '%d' % nColCluster,
               '-I', 'd', 's', tempdataPath,
               '-O', 'c', 's', '0', 'o', tempCCFilePath]
        # cwd= runs the program from cmdPath without changing this process's
        # own working directory.  As before the executable name is bare, so it
        # is resolved through PATH, and the exit status is not inspected.
        # NOTE(review): confirm whether PATH is meant to contain cmdPath or '.'.
        subprocess.call(cmd, cwd=cmdPath)

        with open(tempCCFilePath, 'r') as tempCCFile:
            return tempCCFile.readlines()
    finally:
        # delete temp data, also on failure
        shutil.rmtree(tempPath, ignore_errors=True)

# Script entry point: only when the file is executed directly (not when it is
# imported) are the command-line options parsed and the dictionary built.
if __name__ == '__main__':
    ccUniversalTopicDictionary()