-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathfacecrawler.py
More file actions
126 lines (96 loc) · 2.92 KB
/
facecrawler.py
File metadata and controls
126 lines (96 loc) · 2.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import sys
import time
import os
import re
import simplejson
import urllib
import urllib2
import httplib
import json
import logging
from pymongo import *
# Exclusive upper bound of the numeric ids the crawl loop requests.
MAX_LIMIT = 50
# Seconds passed to time.sleep() between requests.
SLEEP_TIME = 2
# Name of the MongoDB database selected on the connection.
DB_NAME = "facebook"
#-----------------------------------------------------------------------------
def userExists(db, idUser):
    """Count the documents in ``db.users`` whose ``id`` equals ``str(idUser)``.

    Args:
        db: database handle exposing a ``users`` collection.
        idUser: user id; it is converted to a string before querying.

    Returns:
        The number of matching documents, or ``None`` when the lookup raised
        (the traceback is logged and the error is swallowed).
    """
    try:
        matches = db.users.find({"id": str(idUser)})
        return matches.count()
    except Exception:
        import traceback
        trace = traceback.format_exc()
        logger.error('generic exception: ' + trace)
    # Reached only after a logged failure.
    return None
#-----------------------------------------------------------------------------
def insertUser(db, idUser, response):
    """Upsert the JSON document in ``response`` into ``db.users``.

    The document is matched on ``{"id": str(idUser)}`` and its fields are
    applied with ``$set``, so an existing record is updated rather than
    duplicated.

    Args:
        db: database handle exposing a ``users`` collection.
        idUser: user id; it is converted to a string for the match key.
        response: JSON text describing the user.

    Returns:
        None. Any failure (invalid JSON, database error) is logged together
        with the affected user id and then swallowed.
    """
    try:
        d = json.loads(response)
        db.users.update({"id": str(idUser)}, {"$set": d}, upsert=True)
    except Exception:
        import traceback
        logger.error('generic exception: ' + traceback.format_exc())
        # The parameter is ``idUser``; the former ``iduser`` raised NameError
        # from inside this handler and hid the real failure.
        logger.error("user " + str(idUser) + " not processed")
##############################################################################
##############################################################################
# File logging: records at DEBUG and above go to facebook_LOG.log, which is
# opened with filemode 'w'.
logging.basicConfig(
    filename='facebook_LOG.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M'
)
# Console logging: a second handler writes INFO and above to sys.stderr,
# using a slightly simpler layout than the file.
formatter = logging.Formatter(
    fmt='%(asctime)s %(name)-12s: %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M'
)
console = logging.StreamHandler()
console.setFormatter(formatter)
console.setLevel(logging.INFO)
# Attach the console handler to the root logger so every logger inherits it.
logging.getLogger().addHandler(console)
# Logger used by the rest of this script.
logger = logging.getLogger("main")
# Base URL of the Graph API; the numeric user id is appended per request.
url = "http://graph.facebook.com"
# Connect to the database; the script cannot do anything useful without it,
# so a failure is logged and the process exits with status 1.
try:
    con = Connection()
    db = con[DB_NAME]
except Exception:
    # ``except Exception`` rather than a bare ``except`` so that Ctrl-C and
    # SystemExit are not reported as database connection errors.
    import traceback
    logger.error('generic exception: ' + traceback.format_exc())
    logger.error("Error connecting to DB")
    sys.exit(1)
# Crawl loop: for every numeric id in [4, MAX_LIMIT) that is not stored yet,
# fetch <url>/<id>, print the raw response and upsert it into the database.
# The script pauses SLEEP_TIME seconds after every HTTP attempt, successful
# or not.
for i in range(4, MAX_LIMIT):
    urlP = url + "/" + str(i)
    req = urllib2.Request(urlP)
    try:
        exists = userExists(db, i)
        # userExists() yields None when the lookup itself failed; that is
        # treated like "not stored yet", so the id is still fetched.
        if exists:
            print("user " + str(i) + " exists!")
            continue
        response = urllib2.urlopen(req)
        try:
            jsonResponse = response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
        print(jsonResponse)
        if not jsonResponse:
            # str(i) is required: concatenating the int directly raised
            # TypeError and sent an empty response down the generic handler.
            logger.error("user " + str(i) + " not processed")
        else:
            insertUser(db, i, jsonResponse)
        time.sleep(SLEEP_TIME)
    except urllib2.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        logger.error('HTTPError = ' + str(e.code))
        logger.error("user " + str(i) + " not processed")
        time.sleep(SLEEP_TIME)
    except urllib2.URLError as e:
        logger.error('URLError = ' + str(e.reason))
        logger.error("user " + str(i) + " not processed")
        time.sleep(SLEEP_TIME)
    except httplib.HTTPException:
        logger.error('HTTPException')
        logger.error("user " + str(i) + " not processed")
        time.sleep(SLEEP_TIME)
    except Exception:
        # Last-resort boundary: log the traceback and move on to the next id.
        import traceback
        logger.error('generic exception: ' + traceback.format_exc())
        logger.error("user " + str(i) + " not processed")
        time.sleep(SLEEP_TIME)