Source code for makelint

# -*- coding: utf-8 -*-
import collections
import hashlib
import logging
import json
import os
import fcntl
import shutil
import subprocess
import sys
import time

VERSION = "0.1.0"
DEPENDENCY_SUFFIX = ".dep"
MANIFEST_FILENAME = "manifest.txt"
SUCCESS_STAMP = ".success"
FAIL_STAMP = ".fail"

logger = logging.getLogger()


def waitforsize(pidset, njobs):
    """
    Given a set() of pids, wait until it has at most njobs alive children
    """
    output = 0
    while len(pidset) > njobs:
        pid, status = os.wait()
        pidset.remove(pid)
        output |= status
    return output
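
# NOTE(editor): a minimal sketch of the fork/wait pattern that waitforsize()
# supports; ``jobs`` and ``do_work`` are hypothetical placeholders, the real
# callers below fork digest_file(), map_dependencies(), or a tool execution:
#
#   pidset = set()
#   for job in jobs:
#       waitforsize(pidset, njobs - 1)   # block until a worker slot frees up
#       pid = os.fork()
#       if pid == 0:                     # child: do the work and exit
#           do_work(job)
#           os._exit(0)
#       pidset.add(pid)                  # parent: track the child pid
#   waitforsize(pidset, 0)               # finally, reap all remaining children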


def discover_sourcetree(
        source_tree, target_tree, exclude_patterns, include_patterns,
        progress):
    """
    The discovery step performs a filesystem walk in order to build up an
    index of files to be checked. You can use configuration files to set up
    inclusion and exclusion filters for the discovery process. In general,
    though, each directory that is scanned produces a list of files to lint.
    If the timestamp of a tracked directory changes, it is rescanned for new
    files or new directories.

    The output of the discovery phase is a manifest file per directory
    tracked. The creation of this manifest depends on the modification time
    of the directory it corresponds to and will be re-built if the directory
    is changed. If a new subdirectory is added, the system will recursively
    index that new directory. If a directory is removed, it will recursively
    purge that directory from the manifest index.
    """
    if not os.path.exists(target_tree):
        os.makedirs(target_tree)

    ndirs = 1
    dir_idx = 0
    for source_cwd, dirnames, filenames in os.walk(source_tree):
        dir_idx += 1
        progress(dir_idx=dir_idx, ndirs=ndirs)
        relpath_cwd = os.path.relpath(source_cwd, source_tree)
        if relpath_cwd == ".":
            # NOTE(josh): os.path.join("", "foo") == "foo"
            relpath_cwd = ""

        target_cwd = os.path.join(target_tree, relpath_cwd)
        if not os.path.exists(target_cwd):
            os.makedirs(target_cwd)

        # NOTE(josh): is it faster to re-apply filters? or load the result
        # from the manifest?
        filtered_dirnames = []
        for dirname in sorted(dirnames):
            if dirname in (".", ".."):
                continue
            relpath_dir = os.path.join(relpath_cwd, dirname)
            if any(pattern.match(relpath_dir)
                   for pattern in exclude_patterns):
                continue
            filtered_dirnames.append(dirname)

        # Only recurse on directories that are tracked
        dirnames[:] = filtered_dirnames
        ndirs += len(filtered_dirnames)

        manifest_path = os.path.join(target_cwd, MANIFEST_FILENAME)
        if (os.path.exists(manifest_path)
                and os.path.getmtime(manifest_path)
                > os.path.getmtime(source_cwd)):
            # NOTE(josh): this directory has not changed since the last time
            # that we scanned it, so we do not need to rewrite the manifest
            continue

        logger.debug("Scanning: %s", source_cwd)
        filtered_filenames = []
        for filename in sorted(filenames):
            relpath_file = os.path.join(relpath_cwd, filename)
            if any(pattern.match(relpath_file)
                   for pattern in exclude_patterns):
                continue
            if any(pattern.match(relpath_file)
                   for pattern in include_patterns):
                filtered_filenames.append(filename)

        source_dirset = set(dirnames)
        target_dirset = set(dirent.name for dirent in os.scandir(target_cwd)
                            if dirent.is_dir())

        # Directories in the target tree which are not tracked in the source
        # tree. We need to remove them
        for dirname in target_dirset.difference(source_dirset):
            shutil.rmtree(os.path.join(target_cwd, dirname))

        with open(manifest_path, "w") as outfile:
            # NOTE: filtered_filenames has already passed the include/exclude
            # filters above, so each entry is written directly
            for filename in filtered_filenames:
                outfile.write(filename)
                outfile.write("\n")

    progress(dir_idx=ndirs)
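
# NOTE(editor): an illustrative (hypothetical) discovery result: with an
# include pattern matching ``.*\.py`` and an exclude pattern for ``build/.*``,
# a source directory containing ``foo.py``, ``bar.py`` and ``README.rst``
# yields a mirror directory in the target tree whose manifest.txt lists only
# the tracked files, sorted, one per line:
#
#   bar.py
#   foo.py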


def chunk_iter_file(infile, chunk_size=4096):
    """
    Read a file chunk by chunk
    """
    chunk = infile.read(chunk_size)
    while chunk != b"":
        yield chunk
        chunk = infile.read(chunk_size)


def digest_file(source_path, digest_path):
    """
    Compute a message digest of the file content, write the digest (in
    hexadecimal ascii encoding) to the output file
    """
    hasher = hashlib.sha1()
    with open(source_path, "rb") as infile:
        for chunk in chunk_iter_file(infile):
            hasher.update(chunk)
    with open(digest_path, "w") as outfile:
        outfile.write(hasher.hexdigest())
        outfile.write("\n")


def digest_sourcetree_content(source_tree, target_tree, progress, njobs):
    """
    The sha1 of each tracked file is computed and stored in a digest file
    (one per source file). The digest file depends on the modification time
    of the source file. If the source file hasn't changed, the digest file
    doesn't need to be updated.
    """
    progress(tool_idx=1, tool="sha1")
    pidset = set()
    file_idx = 0
    nfiles = 0
    for target_cwd, dirnames, filenames in os.walk(target_tree):
        dirnames[:] = sorted(dirnames)  # stable walk
        relpath_cwd = os.path.relpath(target_cwd, target_tree)
        if relpath_cwd == ".":
            # NOTE(josh): os.path.join("", "foo") == "foo"
            relpath_cwd = ""

        source_cwd = os.path.join(source_tree, relpath_cwd)
        manifest_path = os.path.join(target_cwd, MANIFEST_FILENAME)
        with open(manifest_path) as infile:
            filenames = list(line.strip() for line in infile)

        nfiles += len(filenames)
        progress(nfiles=nfiles)
        for filename in sorted(filenames):
            file_idx += 1
            progress(file_idx=file_idx)
            digest_name = filename + ".sha1"
            source_path = os.path.join(source_cwd, filename)
            digest_path = os.path.join(target_cwd, digest_name)
            if (os.path.exists(digest_path)
                    and os.path.getmtime(digest_path)
                    > os.path.getmtime(source_path)):
                # NOTE(josh): this source file has not changed since the last
                # time that we digested it, so we do not need to re-compute
                # its digest
                continue

            logger.debug("Digesting: %s/%s", relpath_cwd, filename)
            waitforsize(pidset, njobs - 1)
            pid = os.fork()
            if pid == 0:
                digest_file(source_path, digest_path)
                os._exit(0)  # pylint: disable=protected-access
            pidset.add(pid)
    waitforsize(pidset, 0)
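
# NOTE(editor): after discovery and digesting, the target tree mirrors the
# source tree; for a hypothetical tracked file ``pkg/util.py`` it would hold
# something like:
#
#   <target_tree>/pkg/manifest.txt       # file list for the directory
#   <target_tree>/pkg/util.py.sha1       # content digest of the source file
#   <target_tree>/pkg/util.py.dep        # dependency map (written later)
#   <target_tree>/pkg/util.py.dep.sha1   # digest of the dependency map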


# pylint: disable=E1123
if sys.version_info < (2, 6, 0):
    DependencyItem = collections.namedtuple(
        "DependencyItem", ["path", "digest", "name"])
    DependencyItem.__new__.func_defaults = (None,)
elif sys.version_info < (3, 7, 0):
    DependencyItem = collections.namedtuple(
        "DependencyItem", ["path", "digest", "name"])
    DependencyItem.__new__.__defaults__ = (None,)
else:
    DependencyItem = collections.namedtuple(
        "DependencyItem", ["path", "digest", "name"], defaults=(None,))
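
# NOTE(editor): depmap_is_uptodate() below loads the ``.dep`` file as a JSON
# list of objects with these fields; a hypothetical entry (field values are
# illustrative only) might look like:
#
#   [{"path": "pkg/util.py",
#     "digest": "3f786850e387550fdab836ed7e6dc881de23001b",
#     "name": "pkg.util"}]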


def depmap_is_uptodate(target_tree, relpath_file):
    """
    Given a dictionary of dependency data, return true if all of the files
    listed are unchanged since we last ran the scan.
    """
    relpath_depmap = relpath_file + DEPENDENCY_SUFFIX
    depmap_path = os.path.join(target_tree, relpath_depmap)
    if not os.path.exists(depmap_path):
        return False
    if not os.path.exists(depmap_path + ".sha1"):
        return False

    depmap_mtime = os.path.getmtime(depmap_path)
    if os.path.getmtime(depmap_path + ".sha1") < depmap_mtime:
        logger.warning("depmap mtime is later than its sha1")
        return False

    with open(depmap_path, "r") as infile:
        depmap_data = json.load(infile)

    for item in depmap_data:
        item = DependencyItem(**item)
        if item.path.startswith("/"):
            if not os.path.exists(item.path):
                logger.debug("%s disappeared", item.path)
                return False

            # The dependency is an absolute path, which means that it is
            # outside the source tree. We don't have a digest cache of this
            # file so if its timestamp indicates it is newer we must act on
            # that.
            if os.path.getmtime(item.path) > depmap_mtime:
                logger.debug("%s is newer", item.path)
                return False
            continue

        digest_path = os.path.join(target_tree, item.path + ".sha1")
        if not os.path.exists(digest_path):
            # Digest file does not exist, but corresponding source file is in
            # our source tree... so it must have been excluded during scan
            if not os.path.exists(item.path):
                logger.debug("%s disappeared", item.path)
                return False
            if os.path.getmtime(item.path) > depmap_mtime:
                logger.debug("%s is newer", item.path)
                return False
            continue

        if os.path.getmtime(digest_path) < depmap_mtime:
            # The dependency map is newer than this particular file, so this
            # file does not invalidate it
            continue

        with open(digest_path) as infile:
            digest = infile.read().strip()
        if digest == item.digest:
            # The timestamp on this file is newer than the digest, but the
            # file content is unchanged, so this file does not invalidate it
            continue

        # The timestamp is newer and its content has changed. The dependency
        # map is out of date.
        return False

    return True


def map_dependencies(source_tree, target_tree, source_relpath):
    """
    Get a dependency list from the source file. Write out the dependency
    file and its sha1 digest.
    """
    targetpath = os.path.join(target_tree, source_relpath) + DEPENDENCY_SUFFIX
    with open(targetpath, "w") as outfile:
        subprocess.check_call(
            [sys.executable, "-Bm", "makelint.get_dependencies",
             "--module-relpath", source_relpath,
             "--source-tree", source_tree,
             "--target-tree", target_tree],
            stdout=outfile, stderr=subprocess.DEVNULL)
    digest_file(targetpath, targetpath + ".sha1")


def map_sourcetree_dependencies(source_tree, target_tree, progress, njobs):
    """
    During this phase each tracked source file is indexed to get a complete
    dependency footprint. Note that this is done by importing each module
    file in a clean interpreter process, and then inspecting the `__file__`
    attribute of all modules loaded by the interpreter.
    """
    progress(tool_idx=2, tool="depmap")
    pidset = set()
    file_idx = 0
    for target_cwd, dirnames, filenames in os.walk(target_tree):
        dirnames[:] = sorted(dirnames)  # stable walk
        relpath_cwd = os.path.relpath(target_cwd, target_tree)
        if relpath_cwd == ".":
            # NOTE(josh): os.path.join("", "foo") == "foo"
            relpath_cwd = ""

        manifest_path = os.path.join(target_cwd, MANIFEST_FILENAME)
        with open(manifest_path) as infile:
            filenames = list(line.strip() for line in infile)

        for filename in sorted(filenames):
            file_idx += 1
            progress(file_idx=file_idx)
            relpath_file = os.path.join(relpath_cwd, filename)
            if not depmap_is_uptodate(target_tree, relpath_file):
                logger.debug("Mapping dependencies: %s", relpath_file)
                waitforsize(pidset, njobs - 1)
                pid = os.fork()
                if pid == 0:
                    map_dependencies(source_tree, target_tree, relpath_file)
                    os._exit(0)  # pylint: disable=protected-access
                pidset.add(pid)
    waitforsize(pidset, 0)


def toolstamp_is_uptodate(toolstamp_path, depmap_path):
    """
    Return true if the toolstamp is up to date with respect to the dependency
    map
    """
    digest_path = depmap_path + ".sha1"
    if not os.path.exists(toolstamp_path):
        return False

    if os.path.getmtime(toolstamp_path) > os.path.getmtime(depmap_path):
        # The tool execution stamp is newer than the dependency map, so we
        # know that it is up to date
        return True

    with open(toolstamp_path) as infile:
        toolstamp_digest = infile.read().strip()
    with open(digest_path) as infile:
        depmap_digest = infile.read().strip()

    # If the current dependency map digest matches the dependency map digest
    # when the tool was last executed, then the dependency footprint has not
    # changed (nor the source file itself) so the tool stamp is up to date.
    return toolstamp_digest == depmap_digest


def cat_log(logfile_path, header, merged_log):
    """
    Copy the content from logfile_path into merged_log
    """
    if not merged_log:
        return

    merged_log.write(header)
    merged_log.write("\n")
    merged_log.write("=" * len(header))
    merged_log.write("\n")
    with open(logfile_path) as infile:
        for line in infile:
            merged_log.write(line)
    merged_log.write("\n\n")


def execute_tool_ontree(
        source_tree, target_tree, tool, env, fail_fast, merged_log, progress,
        njobs):
    """
    Execute the given tool
    """
    progress(tool_idx=progress.tool_idx + 1, tool=tool.name)
    pidset = set()
    file_idx = 0
    output = 0
    for target_cwd, dirnames, filenames in os.walk(target_tree):
        dirnames[:] = sorted(dirnames)  # stable walk
        relpath_cwd = os.path.relpath(target_cwd, target_tree)
        if relpath_cwd == ".":
            # NOTE(josh): os.path.join("", "foo") == "foo"
            relpath_cwd = ""

        manifest_path = os.path.join(target_cwd, MANIFEST_FILENAME)
        with open(manifest_path) as infile:
            filenames = list(line.strip() for line in infile)

        for filename in sorted(filenames):
            file_idx += 1
            progress(file_idx=file_idx)
            source_relpath = os.path.join(relpath_cwd, filename)
            toolstamp_path = tool.get_stamp(target_cwd, filename)
            depmap_path = os.path.join(
                target_cwd, filename + DEPENDENCY_SUFFIX)
            logfile_path = toolstamp_path + ".log"

            if toolstamp_is_uptodate(toolstamp_path, depmap_path):
                with open(toolstamp_path) as infile:
                    content = infile.read().strip()
                if content == "fail":
                    output |= 1
                    header = "{} (cached)".format(source_relpath)
                    cat_log(logfile_path, header, merged_log)
                    if fail_fast:
                        return output
            else:
                if os.path.exists(toolstamp_path):
                    os.remove(toolstamp_path)

                output |= waitforsize(pidset, njobs - 1)
                if fail_fast and output:
                    output |= waitforsize(pidset, 0)
                    return output

                pid = os.fork()
                if pid != 0:
                    pidset.add(pid)
                    continue

                # Child process
                with open(logfile_path, "w") as outfile:
                    result = tool.execute(
                        source_tree, source_relpath, env, outfile)

                if result == 0:
                    logger.debug("%s: okay!", toolstamp_path)
                    shutil.copyfile(depmap_path + ".sha1", toolstamp_path)
                    os.remove(logfile_path)
                else:
                    with open(toolstamp_path, "w") as outfile:
                        outfile.write("fail")
                    logger.info("%s: failed :(", toolstamp_path)
                    # NOTE(josh): we have multiple processes catting to this
                    # file, so we need to serialize the cat operation to
                    # prevent interleaving.
                    fcntl.flock(merged_log, fcntl.LOCK_EX)
                    cat_log(logfile_path, source_relpath, merged_log)
                    fcntl.flock(merged_log, fcntl.LOCK_UN)
                    merged_log.close()

                os._exit(result)  # pylint: disable=protected-access

    output |= waitforsize(pidset, 0)
    return output


def get_progress_bar(numchars, fraction=None, percent=None):
    """
    Return a high resolution unicode progress bar
    """
    if percent is not None:
        fraction = percent / 100.0

    if fraction >= 1.0:
        return "█" * numchars

    blocks = [" ", "▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"]
    length_in_chars = fraction * numchars
    n_full = int(length_in_chars)
    i_partial = int(8 * (length_in_chars - n_full))
    n_empty = max(numchars - n_full - 1, 0)
    return ("█" * n_full) + blocks[i_partial] + (" " * n_empty)
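
# NOTE(editor): a worked example of the bar rendering: at 37.5% of a
# 20-character bar, 7.5 character cells are filled, which renders as seven
# full blocks plus the half-block glyph, padded with spaces to 20 cells:
#
#   >>> get_progress_bar(20, fraction=0.375)
#   '███████▌            '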


class ProgressReporter(object):
    """
    Prints a status message
    """

    def __init__(self):
        self.ndirs = 0
        self.nfiles = 0
        self.ntools = 0
        self.dir_idx = 0
        self.tool_idx = 0
        self.file_idx = 0
        self.tool = ""
        self.lastprint = 0
        self.toolnames = [""] * 10

    def __call__(self, **kwargs):
        rewind = kwargs.pop("rewind", True)
        force = kwargs.pop("force", False)
        if "tool_idx" in kwargs and "tool" in kwargs:
            self.toolnames[kwargs["tool_idx"]] = kwargs["tool"]
        for key, value in kwargs.items():
            setattr(self, key, value)

        if time.time() - self.lastprint < 0.1 and not force:
            return
        self.lastprint = time.time()

        nlines = self.do_print()
        if rewind:
            # Move back up to the top of the status block we just printed
            sys.stdout.write("\x1b[{}F".format(nlines))
        sys.stdout.flush()

    def get_nsteps(self):
        """
        Return the total number of steps to completion
        """
        return self.nfiles + (self.ntools * self.nfiles)

    def get_istep(self):
        """
        Return the index of our current step
        """
        return (self.tool_idx * self.nfiles) + self.file_idx

    def get_progress(self):
        """
        Return current progress as a percentage
        """
        if self.get_nsteps() == 0:
            return 0
        return 100.0 * self.get_istep() / self.get_nsteps()
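
    # NOTE(editor): a worked (hypothetical) example of the step accounting
    # above: with nfiles=100 and ntools=2, get_nsteps() is 100 + 2*100 = 300;
    # at tool_idx=2 and file_idx=50, get_istep() is 2*100 + 50 = 250, so
    # get_progress() reports 100.0 * 250 / 300 ≈ 83.33%.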

    def do_print(self):
        nlines = 0
        sys.stdout.write(
            "{:>10s}: {:5d}/{:<5d} [{}] {:6.2f}%"
            .format("Total", self.get_istep(), self.get_nsteps(),
                    get_progress_bar(20, percent=self.get_progress()),
                    self.get_progress()))
        sys.stdout.write("\x1b[0K\n")  # clear the rest of the line
        nlines += 1

        progress = 0.0
        if self.ndirs > 0:
            progress = 100.0 * self.dir_idx / self.ndirs
        sys.stdout.write(
            "{:>10s}: {:5d}/{:<5d} [{}] {:6.2f}%"
            .format("Indexing", self.dir_idx, self.ndirs,
                    get_progress_bar(20, percent=progress), progress))
        sys.stdout.write("\x1b[0K\n")  # clear the rest of the line
        nlines += 1

        for idx in range(1, self.tool_idx):
            sys.stdout.write(
                "{0:>10s}: {1:5d}/{1:<5d} [{2}] {3:6.2f}%"
                .format(self.toolnames[idx], self.nfiles,
                        get_progress_bar(20, 1.0), 100.0))
            sys.stdout.write("\x1b[0K\n")
            nlines += 1

        if self.tool_idx > 0:
            progress = 0.0
            if self.nfiles > 0:
                progress = 100.0 * self.file_idx / self.nfiles
            sys.stdout.write(
                "{:>10s}: {:5d}/{:<5d} [{}] {:6.2f}%"
                .format(self.tool, self.file_idx, self.nfiles,
                        get_progress_bar(20, percent=progress), progress))
            sys.stdout.write("\x1b[0K\n")  # clear the rest of the line
            nlines += 1

        return nlines


class NullProgressReport(object):
    """
    No-op for quiet mode
    """

    def __call__(self, *args, **kwargs):
        pass