testing/mozharness/scripts/release/antivirus.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193

from multiprocessing.pool import ThreadPool
import os
import re
import sys
import shutil

sys.path.insert(1, os.path.dirname(os.path.dirname(sys.path[0])))

from mozharness.base.python import VirtualenvMixin, virtualenv_config_options
from mozharness.base.script import BaseScript


class AntivirusScan(BaseScript, VirtualenvMixin):
    config_options = [
        [["--product"], {
            "dest": "product",
            "help": "Product being released, eg: firefox, thunderbird",
        }],
        [["--version"], {
            "dest": "version",
            "help": "Version of release, eg: 39.0b5",
        }],
        [["--build-number"], {
            "dest": "build_number",
            "help": "Build number of release, eg: 2",
        }],
        [["--bucket-name"], {
            "dest": "bucket_name",
            "help": "S3 Bucket to retrieve files from",
        }],
        [["--exclude"], {
            "dest": "excludes",
            "action": "append",
            "help": "List of filename patterns to exclude. See script source for default",
        }],
        [["-d", "--download-parallelization"], {
            "dest": "download_parallelization",
            "default": 6,
            "type": "int",
            "help": "Number of concurrent file downloads",
        }],
        [["-s", "--scan-parallelization"], {
            "dest": "scan_parallelization",
            "default": 4,
            "type": "int",
            "help": "Number of concurrent file scans",
        }],
        [["--tools-repo"], {
            "dest": "tools_repo",
            "default": "https://hg.mozilla.org/build/tools",
        }],
        [["--tools-revision"], {
            "dest": "tools_revision",
            "help": "Revision of tools repo to use when downloading extract_and_run_command.py",
        }],
        ] + virtualenv_config_options

    DEFAULT_EXCLUDES = [
        r"^.*tests.*$",
        r"^.*crashreporter.*$",
        r"^.*\.zip(\.asc)?$",
        r"^.*\.log$",
        r"^.*\.txt$",
        r"^.*\.asc$",
        r"^.*/partner-repacks.*$",
        r"^.*.checksums(\.asc)?$",
        r"^.*/logs/.*$",
        r"^.*/jsshell.*$",
        r"^.*json$",
        r"^.*/host.*$",
        r"^.*/mar-tools/.*$",
        r"^.*robocop.apk$",
        r"^.*contrib.*"
    ]
    CACHE_DIR = 'cache'

    def __init__(self):
        BaseScript.__init__(self,
            config_options=self.config_options,
            require_config_file=False,
            config={
                "virtualenv_modules": [
                    "boto",
                    "redo",
                    "mar",
                ],
                "virtualenv_path": "venv",
            },
            all_actions=[
                "create-virtualenv",
                "activate-virtualenv",
                "get-extract-script",
                "get-files",
                "scan-files",
                "cleanup-cache",
            ],
            default_actions=[
                "create-virtualenv",
                "activate-virtualenv",
                "get-extract-script",
                "get-files",
                "scan-files",
                "cleanup-cache",
            ],
        )
        self.excludes = self.config.get('excludes', self.DEFAULT_EXCLUDES)
        self.dest_dir = self.CACHE_DIR

    def _get_candidates_prefix(self):
        return "pub/{}/candidates/{}-candidates/build{}/".format(
            self.config['product'],
            self.config["version"],
            self.config["build_number"]
        )

    def _matches_exclude(self, keyname):
        for exclude in self.excludes:
            if re.search(exclude, keyname):
                return True
        return False

    def get_extract_script(self):
        """Gets a copy of extract_and_run_command.py from tools, and the supporting mar.py,
        so that we can unpack various files for clam to scan them."""
        remote_file = "{}/raw-file/{}/stage/extract_and_run_command.py".format(self.config["tools_repo"],
                                                                               self.config["tools_revision"])
        self.download_file(remote_file, file_name="extract_and_run_command.py")

    def get_files(self):
        """Pull the candidate files down from S3 for scanning, using parallel requests"""
        from boto.s3.connection import S3Connection
        from boto.exception import S3CopyError, S3ResponseError
        from redo import retry
        from httplib import HTTPException

        # suppress boto debug logging, it's too verbose with --loglevel=debug
        import logging
        logging.getLogger('boto').setLevel(logging.INFO)

        self.info("Connecting to S3")
        conn = S3Connection(anon=True)
        self.info("Getting bucket {}".format(self.config["bucket_name"]))
        bucket = conn.get_bucket(self.config["bucket_name"])

        if os.path.exists(self.dest_dir):
            self.info('Emptying {}'.format(self.dest_dir))
            shutil.rmtree(self.dest_dir)
        os.makedirs(self.dest_dir)

        def worker(item):
            source, destination = item

            self.info("Downloading {} to {}".format(source, destination))
            key = bucket.get_key(source)
            return retry(key.get_contents_to_filename,
                         args=(destination, ),
                         sleeptime=30, max_sleeptime=150,
                         retry_exceptions=(S3CopyError, S3ResponseError,
                                           IOError, HTTPException))

        def find_release_files():
            candidates_prefix = self._get_candidates_prefix()
            self.info("Getting key names from candidates")
            for key in bucket.list(prefix=candidates_prefix):
                keyname = key.name
                if self._matches_exclude(keyname):
                    self.debug("Excluding {}".format(keyname))
                else:
                    destination = os.path.join(self.dest_dir, keyname.replace(candidates_prefix, ''))
                    dest_dir = os.path.dirname(destination)
                    if not os.path.isdir(dest_dir):
                        os.makedirs(dest_dir)
                    yield (keyname, destination)

        pool = ThreadPool(self.config["download_parallelization"])
        pool.map(worker, find_release_files())

    def scan_files(self):
        """Scan the files we've collected. We do the download and scan concurrently to make
        it easier to have a coherent log afterwards. Uses the venv python."""
        self.run_command([self.query_python_path(), 'extract_and_run_command.py',
                          '-j{}'.format(self.config['scan_parallelization']),
                          'clamdscan', '-m', '--no-summary', '--', self.dest_dir])

    def cleanup_cache(self):
        """If we have simultaneous releases in flight an av slave may end up doing another
        av job before being recycled, and we need to make sure the full disk is available."""
        shutil.rmtree(self.dest_dir)


if __name__ == "__main__":
    myScript = AntivirusScan()
    myScript.run_and_exit()