1 files changed, 193 insertions, 0 deletions
diff --git a/testing/mozharness/scripts/release/antivirus.py b/testing/mozharness/scripts/release/antivirus.py
new file mode 100644
index 000000000..b40dc5cc0
--- /dev/null
+++ b/testing/mozharness/scripts/release/antivirus.py
@@ -0,0 +1,193 @@
+from multiprocessing.pool import ThreadPool
+import os
+import re
+import sys
+import shutil
+
+sys.path.insert(1, os.path.dirname(os.path.dirname(sys.path[0])))
+
+from mozharness.base.python import VirtualenvMixin, virtualenv_config_options
+from mozharness.base.script import BaseScript
+
+
+class AntivirusScan(BaseScript, VirtualenvMixin):
+    config_options = [
+        [["--product"], {
+            "dest": "product",
+            "help": "Product being released, eg: firefox, thunderbird",
+        }],
+        [["--version"], {
+            "dest": "version",
+            "help": "Version of release, eg: 39.0b5",
+        }],
+        [["--build-number"], {
+            "dest": "build_number",
+            "help": "Build number of release, eg: 2",
+        }],
+        [["--bucket-name"], {
+            "dest": "bucket_name",
+            "help": "S3 Bucket to retrieve files from",
+        }],
+        [["--exclude"], {
+            "dest": "excludes",
+            "action": "append",
+            "help": "List of filename patterns to exclude. See script source for default",
+        }],
+        [["-d", "--download-parallelization"], {
+            "dest": "download_parallelization",
+            "default": 6,
+            "type": "int",
+            "help": "Number of concurrent file downloads",
+        }],
+        [["-s", "--scan-parallelization"], {
+            "dest": "scan_parallelization",
+            "default": 4,
+            "type": "int",
+            "help": "Number of concurrent file scans",
+        }],
+        [["--tools-repo"], {
+            "dest": "tools_repo",
+            "default": "https://hg.mozilla.org/build/tools",
+        }],
+        [["--tools-revision"], {
+            "dest": "tools_revision",
+            "help": "Revision of tools repo to use when downloading extract_and_run_command.py",
+        }],
+        ] + virtualenv_config_options
+
+    DEFAULT_EXCLUDES = [
+        r"^.*tests.*$",
+        r"^.*crashreporter.*$",
+        r"^.*\.zip(\.asc)?$",
+        r"^.*\.log$",
+        r"^.*\.txt$",
+        r"^.*\.asc$",
+        r"^.*/partner-repacks.*$",
+        r"^.*.checksums(\.asc)?$",
+        r"^.*/logs/.*$",
+        r"^.*/jsshell.*$",
+        r"^.*json$",
+        r"^.*/host.*$",
+        r"^.*/mar-tools/.*$",
+        r"^.*robocop.apk$",
+        r"^.*contrib.*"
+    ]
+    CACHE_DIR = 'cache'
+
+    def __init__(self):
+        BaseScript.__init__(self,
+            config_options=self.config_options,
+            require_config_file=False,
+            config={
+                "virtualenv_modules": [
+                    "boto",
+                    "redo",
+                    "mar",
+                ],
+                "virtualenv_path": "venv",
+            },
+            all_actions=[
+                "create-virtualenv",
+                "activate-virtualenv",
+                "get-extract-script",
+                "get-files",
+                "scan-files",
+                "cleanup-cache",
+            ],
+            default_actions=[
+                "create-virtualenv",
+                "activate-virtualenv",
+                "get-extract-script",
+                "get-files",
+                "scan-files",
+                "cleanup-cache",
+            ],
+        )
+        self.excludes = self.config.get('excludes', self.DEFAULT_EXCLUDES)
+        self.dest_dir = self.CACHE_DIR
+
+    def _get_candidates_prefix(self):
+        return "pub/{}/candidates/{}-candidates/build{}/".format(
+            self.config['product'],
+            self.config["version"],
+            self.config["build_number"]
+        )
+
+    def _matches_exclude(self, keyname):
+        for exclude in self.excludes:
+            if re.search(exclude, keyname):
+                return True
+        return False
+
+    def get_extract_script(self):
+        """Gets a copy of extract_and_run_command.py from tools, and the supporting mar.py,
+        so that we can unpack various files for clam to scan them."""
+        remote_file = "{}/raw-file/{}/stage/extract_and_run_command.py".format(self.config["tools_repo"],
+                                                                               self.config["tools_revision"])
+        self.download_file(remote_file, file_name="extract_and_run_command.py")
+
+    def get_files(self):
+        """Pull the candidate files down from S3 for scanning, using parallel requests"""
+        from boto.s3.connection import S3Connection
+        from boto.exception import S3CopyError, S3ResponseError
+        from redo import retry
+        from httplib import HTTPException
+
+        # suppress boto debug logging, it's too verbose with --loglevel=debug
+        import logging
+        logging.getLogger('boto').setLevel(logging.INFO)
+
+        self.info("Connecting to S3")
+        conn = S3Connection(anon=True)
+        self.info("Getting bucket {}".format(self.config["bucket_name"]))
+        bucket = conn.get_bucket(self.config["bucket_name"])
+
+        if os.path.exists(self.dest_dir):
+            self.info('Emptying {}'.format(self.dest_dir))
+            shutil.rmtree(self.dest_dir)
+        os.makedirs(self.dest_dir)
+
+        def worker(item):
+            source, destination = item
+
+            self.info("Downloading {} to {}".format(source, destination))
+            key = bucket.get_key(source)
+            return retry(key.get_contents_to_filename,
+                         args=(destination, ),
+                         sleeptime=30, max_sleeptime=150,
+                         retry_exceptions=(S3CopyError, S3ResponseError,
+                                           IOError, HTTPException))
+
+        def find_release_files():
+            candidates_prefix = self._get_candidates_prefix()
+            self.info("Getting key names from candidates")
+            for key in bucket.list(prefix=candidates_prefix):
+                keyname = key.name
+                if self._matches_exclude(keyname):
+                    self.debug("Excluding {}".format(keyname))
+                else:
+                    destination = os.path.join(self.dest_dir, keyname.replace(candidates_prefix, ''))
+                    dest_dir = os.path.dirname(destination)
+                    if not os.path.isdir(dest_dir):
+                        os.makedirs(dest_dir)
+                    yield (keyname, destination)
+
+        pool = ThreadPool(self.config["download_parallelization"])
+        pool.map(worker, find_release_files())
+
+    def scan_files(self):
+        """Scan the files we've collected. We do the download and scan concurrently to make
+        it easier to have a coherent log afterwards. Uses the venv python."""
+        self.run_command([self.query_python_path(), 'extract_and_run_command.py',
+                          '-j{}'.format(self.config['scan_parallelization']),
+                          'clamdscan', '-m', '--no-summary', '--', self.dest_dir])
+
+    def cleanup_cache(self):
+        """If we have simultaneous releases in flight an av slave may end up doing another
+        av job before being recycled, and we need to make sure the full disk is available."""
+        shutil.rmtree(self.dest_dir)
+
+
+if __name__ == "__main__":
+    myScript = AntivirusScan()
+    myScript.run_and_exit()