-
Notifications
You must be signed in to change notification settings - Fork 0
/
features.py
151 lines (136 loc) · 5.34 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from __future__ import division
import htmlDiff
import sys
import string
import pdb
from GSB.gsb import getDomainStatus
from pprint import *
def getCSVHeaders():
    """Return the column names for one feature row, in feature order.

    A new list is built on every call, so callers may mutate the result.
    """
    return [
        'elementType',
        'editType',
        'scriptLength',
        'specialCharRatio',
        'GSB',
        'jsEval',
    ]
# Known HTML element names; an element's position in this list is its ID.
elementTypesList = [
    'a', 'abbr', 'acronym', 'address', 'applet', 'area', 'article', 'aside',
    'audio', 'b', 'base', 'basefont', 'bdi', 'bdo', 'bgsound', 'big', 'blink',
    'blockquote', 'body', 'br', 'button', 'canvas', 'caption', 'center',
    'cite', 'code', 'col', 'colgroup', 'command', 'content', 'data',
    'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'dir', 'div', 'dl',
    'dt', 'element', 'em', 'embed', 'fieldset', 'figcaption', 'figure',
    'font', 'footer', 'form', 'frame', 'frameset', 'head', 'header', 'hgroup',
    'hr', 'html', 'i', 'iframe', 'image', 'img', 'input', 'ins', 'isindex',
    'kbd', 'keygen', 'label', 'legend', 'li', 'link', 'listing', 'main',
    'map', 'mark', 'marquee', 'menu', 'menuitem', 'meta', 'meter', 'multicol',
    'nav', 'nobr', 'noembed', 'noframes', 'noscript', 'object', 'ol',
    'optgroup', 'option', 'output', 'p', 'param', 'picture', 'plaintext',
    'pre', 'progress', 'q', 'rp', 'rt', 'rtc', 'ruby', 's', 'samp', 'script',
    'section', 'select', 'shadow', 'small', 'source', 'spacer', 'span',
    'strike', 'strong', 'style', 'sub', 'summary', 'sup', 'table', 'tbody',
    'td', 'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title',
    'tr', 'track', 'tt', 'u', 'ul', 'var', 'video', 'wbr', 'xmp',
]

# Element name -> numeric ID; filled in by createElementTypesDict().
elementTypes = {}


def createElementTypesDict():
    """Populate ``elementTypes`` with each listed name mapped to its index.

    The module-level dict is updated in place; existing entries for names in
    ``elementTypesList`` are overwritten with their current list position.
    """
    elementTypes.update(
        (name, position) for position, name in enumerate(elementTypesList)
    )
# Map each element type to a numerical value for the feature vector.
def getElementID(diff):
    """Return the numeric ID of the element type recorded in *diff*.

    Any suffix after the first underscore is dropped, so 'script_1' falls
    into the 'script' bucket. A tag that is not yet known is registered in
    both ``elementTypesList`` and ``elementTypes``.

    Args:
        diff: mapping with an 'elementType' entry.

    Returns:
        The element's ID, or -1 when the element type is not a string-like
        value. In that case ``diff['elementType']`` is overwritten with
        'builtin_function_or_method', as the original code did.

    Raises:
        KeyError: if *diff* has no 'elementType' entry.
    """
    elementType = diff['elementType']
    try:
        tag = elementType.split('_')[0]
    except (AttributeError, TypeError):
        diff['elementType'] = 'builtin_function_or_method'
        return -1

    try:
        return elementTypes[tag]
    except KeyError:
        pass

    # The ID must equal the tag's index in elementTypesList, because
    # createElementTypesDict() rebuilds the mapping from list positions.
    # Assigning len(list) after the append would be off by one and the ID
    # would change on the next rebuild.
    try:
        position = elementTypesList.index(tag)
    except ValueError:
        position = len(elementTypesList)
        elementTypesList.append(tag)
    elementTypes[tag] = position
    return position
def printElementTypes():
    """Pretty-print the currently known element type names, sorted."""
    names = list(elementTypes.keys())
    names.sort()
    pprint(names)
# Change type -> numeric ID, assigned in order of first appearance.
changeTypes = {}


# Map each change type to a numerical value for the feature vector.
def getChangeID(otherInfo):
    """Return the numeric ID for the change type *otherInfo*.

    The first time a value is seen it is given the next free ID (the current
    size of ``changeTypes``); later calls return the same ID.

    Args:
        otherInfo: hashable value identifying the kind of change.

    Returns:
        The integer ID associated with *otherInfo*.

    Raises:
        TypeError: if *otherInfo* is not hashable.
    """
    # setdefault replaces the former try / bare-except lookup, which caught
    # every exception just to detect a missing key.
    return changeTypes.setdefault(otherInfo, len(changeTypes))
# Calculate the length of the script that is to be run.
def scriptLen(change):
    """Return the length of a script change's 'afterText'.

    Args:
        change: mapping describing one change; 'elementType' and 'afterText'
            are read.

    Returns:
        ``len(change['afterText'])`` when 'script' occurs in the element
        type; 0 when it is a script change but 'afterText' is missing or has
        no length; -1 when the change is not a script or its element type
        cannot be inspected.
    """
    # Only lookup/containment failures are expected here; anything else
    # (e.g. KeyboardInterrupt) is no longer swallowed.
    try:
        if 'script' not in change['elementType']:
            return -1
    except (KeyError, TypeError):
        return -1

    try:
        return len(change['afterText'])
    except (KeyError, TypeError):
        return 0
# Calculate the ratio of special characters to other characters in a change.
def specialCharRatio(change):
    """Return punctuation count divided by non-punctuation count.

    Characters in ``string.punctuation`` are counted as special; everything
    else in ``change['afterText']`` counts as normal.

    Args:
        change: mapping describing one change; 'afterText' is read.

    Returns:
        The ratio as a float, or -1 when it cannot be computed: the text has
        no non-punctuation characters (including the empty string), or
        'afterText' is missing or is not text.
    """
    try:
        text = change['afterText']
        specialCount = sum(text.count(char) for char in string.punctuation)
        normalCount = len(text) - specialCount
        # float() keeps true division even without the module-level
        # ``from __future__ import division``.
        return float(specialCount) / normalCount
    except (ZeroDivisionError, AttributeError, KeyError, TypeError):
        return -1
def getGSB(change):
try:
url = change['afterAttribute']['src']
response=getDomainStatus(url)
if response == 'malware':
return 1
elif response == 'phishing':
return 2
elif response == 'unwanted':
return 3
else:
return 0
except KeyError:
return -1
except TypeError:
return -1
def jsEval(change):
    """Count dynamic-code markers in a script change.

    The markers are the substrings 'eval', 'document.write' and
    'document.createElement' in ``change['afterText']``.

    Args:
        change: mapping describing one change; 'elementType' and 'afterText'
            are read.

    Returns:
        The total number of marker occurrences for a script element, or -1
        when the change is not a script or the needed fields are missing or
        not text.
    """
    markers = ('eval', 'document.write', 'document.createElement')
    try:
        # Element types can carry a suffix such as 'script_1'; strip it the
        # same way getElementID does so suffixed scripts are not skipped.
        if change['elementType'].split('_')[0] != 'script':
            return -1
        text = change['afterText']
        return sum(text.count(marker) for marker in markers)
    except (AttributeError, KeyError, TypeError):
        return -1
# Return a list of feature vectors for the files.
def getFeatures(diff):
    """Build one feature row per change in *diff*.

    The element-type mapping is (re)initialised first. Changes are then
    visited in sorted key order, and each row holds, in order: element ID,
    change ID, script length, special-character ratio, GSB code and JS eval
    count.

    Args:
        diff: mapping of change key -> change mapping; every change must
            have an 'otherInfo' entry.

    Returns:
        A list of six-element lists, one per change.
    """
    createElementTypesDict()
    rows = []
    for changeKey in sorted(diff.keys()):
        entry = diff[changeKey]
        # Evaluated left to right; getElementID runs first because it may
        # rewrite entry['elementType'] before the other extractors read it.
        rows.append([
            getElementID(entry),
            getChangeID(entry['otherInfo']),
            scriptLen(entry),
            specialCharRatio(entry),
            getGSB(entry),
            jsEval(entry),
        ])
    return rows
if __name__ == "__main__":
    # Usage: features.py [before_file after_file]
    # Without two path arguments, fall back to the bundled sample pair.
    cliPaths = sys.argv[1:3]
    if len(cliPaths) == 2:
        file1, file2 = cliPaths
    else:
        file1 = 'ground_truth/malware_traffic_analysis/2015_10_20_before'
        file2 = 'ground_truth/malware_traffic_analysis/2015_10_20_after'
    diff = htmlDiff.htmldiff(file1, file2)
    # The former try/except blocks called rstrip() on an undefined
    # ``outputDict`` and discarded the result; they always raised NameError,
    # which the bare except hid, so they were dead code and are removed.
    features = getFeatures(diff)