# -*- coding: utf-8 -*-
import collections
import hashlib
import logging
import json
import os
import fcntl
import shutil
import subprocess
import sys
import time
VERSION = "0.1.0"
DEPENDENCY_SUFFIX = ".dep"
MANIFEST_FILENAME = "manifest.txt"
SUCCESS_STAMP = ".success"
FAIL_STAMP = ".fail"
logger = logging.getLogger()
def waitforsize(pidset, njobs):
    """
    Block until at most `njobs` of the child pids in `pidset` remain alive.

    Reaped pids are removed from `pidset` in place. Returns the bitwise-or
    of every reaped child's wait status (zero if nothing was reaped or all
    children exited cleanly).
    """
    combined_status = 0
    while len(pidset) > njobs:
        child_pid, child_status = os.wait()
        pidset.remove(child_pid)
        combined_status |= child_status
    return combined_status
def discover_sourcetree(
        source_tree, target_tree, exclude_patterns, include_patterns,
        progress):
    """
    The discovery step performs a filesystem walk in order to build up an index
    of files to be checked. You can use configuration files to setup inclusion
    and exclusion filters for the discovery process. In general, though, each
    directory that is scanned produces a list of files to lint. If the timestamp
    of a tracked directory changes, it is rescanned for new files, or new
    directories.

    The output of the discovery phase is a manifest file per-directory tracked.
    The creation of this manifest depends on the modification time of the
    directory it corresponds to and will be re-built if the directory is
    changed. If a new subdirectory is added, the system will recursively index
    that new directory. If a directory is removed, it will recursively purge
    that directory from the manifest index.

    :param source_tree: root of the tree to scan
    :param target_tree: root of the mirror tree where manifests are written
    :param exclude_patterns: compiled regexes; any match excludes a path
    :param include_patterns: compiled regexes; a file must match one
    :param progress: callable accepting keyword arguments (dir_idx, ndirs)
    """
    if not os.path.exists(target_tree):
        os.makedirs(target_tree)

    ndirs = 1
    dir_idx = 0
    for source_cwd, dirnames, filenames in os.walk(source_tree):
        dir_idx += 1
        progress(dir_idx=dir_idx, ndirs=ndirs)
        relpath_cwd = os.path.relpath(source_cwd, source_tree)
        if relpath_cwd == ".":
            # NOTE(josh): os.path.join("", "foo") == "foo"
            relpath_cwd = ""
        target_cwd = os.path.join(target_tree, relpath_cwd)
        if not os.path.exists(target_cwd):
            os.makedirs(target_cwd)

        # NOTE(josh): is it faster to re-apply filters? or load the result
        # from the manifest?
        filtered_dirnames = []
        for dirname in sorted(dirnames):
            if dirname in (".", ".."):
                continue
            relpath_dir = os.path.join(relpath_cwd, dirname)
            if any(pattern.match(relpath_dir) for pattern in exclude_patterns):
                continue
            filtered_dirnames.append(dirname)
        # Only recurse on directories that are tracked
        dirnames[:] = filtered_dirnames
        ndirs += len(filtered_dirnames)

        manifest_path = os.path.join(target_cwd, MANIFEST_FILENAME)
        if (os.path.exists(manifest_path) and
                os.path.getmtime(manifest_path) >
                os.path.getmtime(source_cwd)):
            # NOTE(josh): this directory has not changed since the last time
            # that we scanned it, so we do not need to rewrite the manifest
            continue

        logger.debug("Scanning: %s", source_cwd)
        filtered_filenames = []
        for filename in sorted(filenames):
            relpath_file = os.path.join(relpath_cwd, filename)
            if any(pattern.match(relpath_file)
                   for pattern in exclude_patterns):
                continue
            if any(pattern.match(relpath_file)
                   for pattern in include_patterns):
                filtered_filenames.append(filename)

        # Directories in the target tree which are not tracked in the source
        # tree. We need to remove them
        source_dirset = set(dirnames)
        target_dirset = set(dirent.name for dirent in os.scandir(target_cwd)
                            if dirent.is_dir())
        for dirname in target_dirset.difference(source_dirset):
            shutil.rmtree(os.path.join(target_cwd, dirname))

        with open(manifest_path, "w") as outfile:
            # NOTE: `filtered_filenames` is already filtered above. The
            # previous implementation re-tested `relpath_dir` here -- a stale
            # variable left over from the directory loop -- which raised
            # NameError for a root directory with no subdirectories and
            # wrongly dropped files otherwise.
            for filename in filtered_filenames:
                outfile.write(filename)
                outfile.write("\n")
    progress(dir_idx=ndirs)
def chunk_iter_file(infile, chunk_size=4096):
    """
    Yield successive chunks of at most `chunk_size` bytes read from
    `infile` until end-of-file.
    """
    # iter() with a sentinel calls the reader until it returns b"" (EOF)
    for piece in iter(lambda: infile.read(chunk_size), b""):
        yield piece
def digest_file(source_path, digest_path):
    """
    Compute the sha1 digest of the content of `source_path` and write it
    (as hexadecimal ascii, terminated by a newline) to `digest_path`.
    """
    hasher = hashlib.sha1()
    # Read in fixed-size chunks so arbitrarily large files hash in
    # constant memory
    with open(source_path, "rb") as infile:
        chunk = infile.read(4096)
        while chunk:
            hasher.update(chunk)
            chunk = infile.read(4096)
    with open(digest_path, "w") as outfile:
        outfile.write(hasher.hexdigest() + "\n")
def digest_sourcetree_content(source_tree, target_tree, progress, njobs):
    """
    The sha1 of each tracked file is computed and stored in a digest file
    (one per source file). The digest file depends on the modification time of
    the source file. If the sourcefile hasn't changed, the digest file doesn't
    need to be updated.

    Forks up to `njobs` child processes so files are hashed in parallel;
    all children are reaped before returning.
    """
    progress(tool_idx=1, tool="sha1")
    pidset = set()  # pids of in-flight child hasher processes
    file_idx = 0
    nfiles = 0
    for target_cwd, dirnames, filenames in os.walk(target_tree):
        dirnames[:] = sorted(dirnames)  # stable walk
        relpath_cwd = os.path.relpath(target_cwd, target_tree)
        if relpath_cwd == ".":
            # NOTE(josh): os.path.join("", "foo") == "foo"
            relpath_cwd = ""
        source_cwd = os.path.join(source_tree, relpath_cwd)
        # The per-directory manifest written during discovery is the
        # authoritative file list; the filenames reported by os.walk over
        # the target tree are ignored (rebound below).
        manifest_path = os.path.join(target_cwd, MANIFEST_FILENAME)
        with open(manifest_path) as infile:
            filenames = list(line.strip() for line in infile)
        nfiles += len(filenames)
        progress(nfiles=nfiles)
        for filename in sorted(filenames):
            file_idx += 1
            progress(file_idx=file_idx)
            digest_name = filename + ".sha1"
            source_path = os.path.join(source_cwd, filename)
            digest_path = os.path.join(target_cwd, digest_name)
            if (os.path.exists(digest_path) and
                    os.path.getmtime(digest_path) >
                    os.path.getmtime(source_path)):
                # NOTE(josh): this source file has not changed since the
                # last time that we digested it, so we do not need to
                continue
            logger.debug("Digesting: %s/%s", relpath_cwd, filename)
            # Keep at most njobs children alive: wait for a free slot, then
            # fork a worker to hash this one file
            waitforsize(pidset, njobs - 1)
            pid = os.fork()
            if pid == 0:
                # child process: hash the file then exit immediately,
                # skipping parent cleanup (hence os._exit, not sys.exit)
                digest_file(source_path, digest_path)
                os._exit(0)  # pylint: disable=protected-access
            pidset.add(pid)
    # Reap all remaining children before returning
    waitforsize(pidset, 0)
# DependencyItem describes one dependency of a tracked source file:
#   path   -- filesystem path of the dependency
#   digest -- cached sha1 of its content
#   name   -- optional module name (defaults to None)
#
# namedtuple() only gained the `defaults` keyword in python 3.7; on older
# interpreters the defaults tuple must be assigned onto __new__ after the
# fact (spelled `func_defaults` before python 2.6 introduced the
# `__defaults__` alias). The branches below must form a single if/elif/else
# chain: the original code used two independent `if` statements, so on an
# old interpreter both branches executed and the second clobbered the first.
# pylint: disable=E1123
if sys.version_info < (2, 6, 0):
    DependencyItem = collections.namedtuple(
        "DependencyItem", ["path", "digest", "name"])
    DependencyItem.__new__.func_defaults = (None,)
elif sys.version_info < (3, 7, 0):
    DependencyItem = collections.namedtuple(
        "DependencyItem", ["path", "digest", "name"])
    DependencyItem.__new__.__defaults__ = (None,)
else:
    DependencyItem = collections.namedtuple(
        "DependencyItem", ["path", "digest", "name"],
        defaults=(None,))
def depmap_is_uptodate(target_tree, relpath_file):
    """
    Given a dictionary of dependency data, return true if all of the files
    listed are unchanged since we last ran the scan.
    """
    relpath_depmap = relpath_file + DEPENDENCY_SUFFIX
    depmap_path = os.path.join(target_tree, relpath_depmap)
    # No depmap, or no digest of the depmap: it must be (re)built
    if not os.path.exists(depmap_path):
        return False
    if not os.path.exists(depmap_path + ".sha1"):
        return False
    depmap_mtime = os.path.getmtime(depmap_path)
    if os.path.getmtime(depmap_path + ".sha1") < depmap_mtime:
        # The digest is always rewritten after the depmap, so an older
        # digest means something is inconsistent: force a rebuild
        logger.warning("depmap mtime is later than it's sha1")
        return False
    with open(depmap_path, "r") as infile:
        depmap_data = json.load(infile)
    for item in depmap_data:
        item = DependencyItem(**item)
        if item.path.startswith("/"):
            if not os.path.exists(item.path):
                logger.debug("%s disappeared", item.path)
                return False
            # The dependency is an absolute path, which means that it is
            # outside the source tree. We don't have a digest cache of this
            # file so if its timestamp indicates it is newer we must act on
            # that.
            if os.path.getmtime(item.path) > depmap_mtime:
                logger.debug("%s is newer", item.path)
                return False
            continue
        digest_path = os.path.join(target_tree, item.path + ".sha1")
        if not os.path.exists(digest_path):
            # Digest file does not exist, but corresponding source file is
            # in our source tree... so it must have been excluded during
            # scan.
            # NOTE(review): item.path is relative here, so these os.path
            # calls resolve against the process working directory --
            # presumably the source tree root; confirm against callers
            if not os.path.exists(item.path):
                logger.debug("%s disappeared", item.path)
                return False
            if os.path.getmtime(item.path) > depmap_mtime:
                logger.debug("%s is newer", item.path)
                return False
            continue
        if os.path.getmtime(digest_path) < depmap_mtime:
            # The dependency map is newer than this particular file, so this
            # file does not invalidate it
            continue
        with open(digest_path) as infile:
            digest = infile.read().strip()
        if digest == item.digest:
            # The timestamp on this file is newer than the digest, but the
            # file content is unchanged, so this file does not invalidate it
            continue
        # The timestamp is newer and its content has changed. The dependency
        # map is out of date.
        return False
    return True
def map_dependencies(source_tree, target_tree, source_relpath):
    """
    Compute the dependency list of one source file by running the
    `makelint.get_dependencies` helper in a fresh interpreter, capture its
    stdout into the `.dep` file, then write the sha1 digest of that file.
    """
    depfile_path = (
        os.path.join(target_tree, source_relpath) + DEPENDENCY_SUFFIX)
    command = [
        sys.executable, "-Bm", "makelint.get_dependencies",
        "--module-relpath", source_relpath,
        "--source-tree", source_tree,
        "--target-tree", target_tree
    ]
    with open(depfile_path, "w") as outfile:
        subprocess.check_call(
            command, stdout=outfile, stderr=subprocess.DEVNULL)
    digest_file(depfile_path, depfile_path + ".sha1")
def map_sourcetree_dependencies(source_tree, target_tree, progress, njobs):
    """
    During this phase each tracked
    source file is indexed to get a complete dependency footprint. Note that
    this is done by importing each module file in a clean interpreter
    process, and then inspecting the `__file__` attribute of all modules
    loaded by interpreter.

    Forks up to `njobs` child processes so files are mapped in parallel;
    all children are reaped before returning.
    """
    progress(tool_idx=2, tool="depmap")
    pidset = set()  # pids of in-flight child worker processes
    file_idx = 0
    for target_cwd, dirnames, filenames in os.walk(target_tree):
        dirnames[:] = sorted(dirnames)  # stable walk
        relpath_cwd = os.path.relpath(target_cwd, target_tree)
        if relpath_cwd == ".":
            # NOTE(josh): os.path.join("", "foo") == "foo"
            relpath_cwd = ""
        # The per-directory manifest written during discovery is the
        # authoritative file list; the filenames reported by os.walk over
        # the target tree are ignored (rebound below).
        manifest_path = os.path.join(target_cwd, MANIFEST_FILENAME)
        with open(manifest_path) as infile:
            filenames = list(line.strip() for line in infile)
        for filename in sorted(filenames):
            file_idx += 1
            progress(file_idx=file_idx)
            relpath_file = os.path.join(relpath_cwd, filename)
            if not depmap_is_uptodate(target_tree, relpath_file):
                logger.debug("Mapping dependencies: %s", relpath_file)
                # Keep at most njobs children alive: wait for a free slot,
                # then fork a worker to map this one file
                waitforsize(pidset, njobs - 1)
                pid = os.fork()
                if pid == 0:
                    # child process: do the work then exit immediately,
                    # skipping parent cleanup (hence os._exit, not sys.exit)
                    map_dependencies(source_tree, target_tree, relpath_file)
                    os._exit(0)  # pylint: disable=protected-access
                pidset.add(pid)
    # Reap all remaining children before returning
    waitforsize(pidset, 0)
def cat_log(logfile_path, header, merged_log):
    """
    Append the content of `logfile_path`, preceded by an underlined header
    and followed by a blank line, to the `merged_log` stream. Does nothing
    when `merged_log` is falsy (e.g. None).
    """
    if not merged_log:
        return
    underline = "=" * len(header)
    merged_log.write(header)
    merged_log.write("\n")
    merged_log.write(underline)
    merged_log.write("\n")
    with open(logfile_path) as infile:
        shutil.copyfileobj(infile, merged_log)
    merged_log.write("\n\n")
def get_progress_bar(numchars, fraction=None, percent=None):
    """
    Return a high resolution unicode progress bar
    """
    if percent is not None:
        fraction = percent / 100.0
    if fraction >= 1.0:
        return "█" * numchars
    # Eighth-block glyphs give sub-character resolution for the partial cell
    eighths = [" ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"]
    exact_length = fraction * numchars
    nfull = int(exact_length)
    partial_idx = int(8 * (exact_length - nfull))
    nempty = max(numchars - nfull - 1, 0)
    return "".join(["█" * nfull, eighths[partial_idx], " " * nempty])
class ProgressReporter(object):
    """
    Prints a status message: an overall progress bar plus one line per
    pipeline phase (indexing, then each tool), redrawn in place using ANSI
    cursor-movement escape sequences.
    """

    def __init__(self):
        # phase totals, updated incrementally by callers via __call__ kwargs
        self.ndirs = 0
        self.nfiles = 0
        self.ntools = 0
        # current position within each phase
        self.dir_idx = 0
        self.tool_idx = 0
        self.file_idx = 0
        self.tool = ""  # name of the currently-running tool
        self.lastprint = 0  # time of last redraw, for rate limiting
        # NOTE(review): fixed capacity of 10 tool slots; assumes tool_idx
        # never reaches 10 -- confirm against callers
        self.toolnames = [""] * 10

    def __call__(self, **kwargs):
        """
        Update any of the progress counters (passed as keyword arguments)
        and redraw the display. Redraws are rate-limited to one per 0.1s
        unless force=True; rewind=False leaves the cursor below the output.
        """
        rewind = kwargs.pop("rewind", True)
        force = kwargs.pop("force", False)
        # Remember each tool's name so finished phases can still be labeled
        if "tool_idx" in kwargs and "tool" in kwargs:
            self.toolnames[kwargs["tool_idx"]] = kwargs["tool"]
        for key, value in kwargs.items():
            setattr(self, key, value)
        if time.time() - self.lastprint < 0.1 and not force:
            return
        self.lastprint = time.time()
        nlines = self.do_print()
        if rewind:
            # Move the cursor back up over the nlines just printed so the
            # next redraw overwrites them in place
            sys.stdout.write("\x1b[{}F".format(nlines))
        sys.stdout.flush()

    def get_nsteps(self):
        """
        Return the total number of steps to completion
        """
        return self.nfiles + (self.ntools * self.nfiles)

    def get_istep(self):
        """
        Return the index of our current step
        """
        return (self.tool_idx * self.nfiles) + self.file_idx

    def get_progress(self):
        """
        Return current progress as a percentage
        """
        # Guard against division by zero before any totals are known
        if self.get_nsteps() == 0:
            return 0
        return 100.0 * self.get_istep() / self.get_nsteps()

    def do_print(self):
        """
        Write the status display to stdout and return the number of lines
        printed (so the caller knows how far to rewind the cursor).
        """
        nlines = 0
        # Overall progress across all phases
        sys.stdout.write(
            "{:>10s}: {:5d}/{:<5d} [{}] {:6.2f}%"
            .format("Total", self.get_istep(), self.get_nsteps(),
                    get_progress_bar(20, percent=self.get_progress()),
                    self.get_progress())
        )
        sys.stdout.write("\x1b[0K\n")  # clear the rest of the line
        nlines += 1
        # Indexing (discovery) phase
        progress = 0.0
        if self.ndirs > 0:
            progress = 100.0 * self.dir_idx / self.ndirs
        sys.stdout.write(
            "{:>10s}: {:5d}/{:<5d} [{}] {:6.2f}%"
            .format("Indexing", self.dir_idx, self.ndirs,
                    get_progress_bar(20, percent=progress),
                    progress))
        sys.stdout.write("\x1b[0K\n")  # clear the rest of the line
        nlines += 1
        # Phases that already finished are rendered as complete bars
        for idx in range(1, self.tool_idx):
            sys.stdout.write(
                "{0:>10s}: {1:5d}/{1:<5d} [{2}] {3:6.2f}%"
                .format(self.toolnames[idx], self.nfiles,
                        get_progress_bar(20, 1.0), 100.0))
            sys.stdout.write("\x1b[0K\n")
            nlines += 1
        # The currently-running tool, if any
        if self.tool_idx > 0:
            progress = 0.0
            if self.nfiles > 0:
                progress = 100.0 * self.file_idx / self.nfiles
            sys.stdout.write(
                "{:>10s}: {:5d}/{:<5d} [{}] {:6.2f}%"
                .format(self.tool, self.file_idx, self.nfiles,
                        get_progress_bar(20, percent=progress),
                        progress))
            sys.stdout.write("\x1b[0K\n")  # clear the rest of the line
            nlines += 1
        return nlines
class NullProgressReport(object):
    """
    No-op stand-in for ProgressReporter, used in quiet mode.
    """

    def __call__(self, *args, **kwargs):
        """
        Accept and ignore any positional or keyword arguments.

        The previous signature, ``__call__(self, *kwargs)``, accepted only
        positional arguments, so every progress call in this module (which
        always passes keywords, e.g. ``progress(dir_idx=..., ndirs=...)``)
        raised TypeError in quiet mode.
        """