Source code for pyvcsshark.parser.gitparser

import sys
import os
import logging
import re
import uuid
import multiprocessing

import pygit2

from pyvcsshark.parser.baseparser import BaseParser
from pyvcsshark.parser.models import BranchModel, PeopleModel, TagModel, FileModel, CommitModel, Hunk, BranchTipModel


class GitParser(BaseParser):
    """Parser for git repositories.

    The general parsing process is described in
    :func:`pyvcsshark.parser.gitparser.GitParser.parse`.

    :property SIMILARITY_THRESHOLD: threshold for deciding if a file is similar to
        another (used for rename/copy detection). Default: 50 (the git default).
    :property repository: object of class :class:`pygit2.Repository`, which represents the repository
    :property commits_to_be_processed: dictionary that is set up the following way:
        commits_to_be_processed = {'<revisionHash>' : {'branches' : set(), 'tags' : []}}, where
        <revisionHash> must be replaced with the actual hash. Therefore, this dictionary holds
        information about every revision and which branches this revision belongs to and which
        tags it has.
    :property logger: logger, which is acquired via logging.getLogger("parser")
    :property datastore: datastore, where the commits should be saved to
    :property commit_queue: object of class :class:`multiprocessing.JoinableQueue`, where commits
        are stored in that can be parsed
    """

    # Includes rename and copy threshold, 50% is the default git threshold
    SIMILARITY_THRESHOLD = 50

    def __init__(self):
        self.repository = None
        self.commits_to_be_processed = {}
        self.logger = logging.getLogger("parser")
        self.datastore = None
        self.commit_queue = multiprocessing.JoinableQueue()

    @property
    def repository_type(self):
        return 'git'
[docs] def get_project_url(self): """ Returns the url of the project, which is processed """ url = "local/"+str(uuid.uuid4()) try: url = self.repository.remotes["origin"].url except KeyError: # repository is only local pass return url
[docs] def finalize(self): """Finalization process for parser""" return
[docs] def detect(self, repository_path): """Try to detect the repository, if its not there an exception is raised and therfore false can be returned""" # new versions of pygit2 just set backend to None if repo does not exist if not os.path.exists(repository_path): return False try: discovered_path = pygit2.discover_repository(repository_path) self.repository = pygit2.Repository(discovered_path) return True except Exception: return False
[docs] def add_branch(self, commit_hash, branch): """ Does two things: First it adds the commitHash to the commitqueue, so that the parsing processes can process this commit. Second it creates objects of type :class:`pyvcsshark.parser.models.BranchModel` and stores it in the dictionary. :param commit_hash: revision hash of the commit to be processed :param branch: branch that should be added for the commit """ string_commit_hash = str(commit_hash) if branch is None: branch_model = None else: branch_model = BranchModel(branch) # If the commit is already in the dict, we only need to append the branch (because then it was already parsed) if string_commit_hash in self.commits_to_be_processed: self.commits_to_be_processed[string_commit_hash]['branches'].add(branch_model) else: self.commit_queue.put(string_commit_hash) self.commits_to_be_processed[string_commit_hash] = {'branches': {branch_model}, 'tags': []}
[docs] def add_tag(self, tagged_commit, tag_name, tag_object): """ Creates objects of type :class:`pyvcsshark.parser.models.TagModel` and stores it in the dictionary mentioned above. :param tagged_commit: revision hash of the commit to be processed :param tag_name: name of the tag that should be added :param tag_object: in git it is possible to annotate tags. If a tag is annotated, we get a tag object of class :class:`pygit2.Tag` .. NOTE:: It can happen, that people committed to a tag and therefore created \ a "tag-branch" which is normally not possible in git. Therefore, we go through all tags and check \ if they respond to a commit, which is already in the dictionary. \ If **yes** -> we **tag** that commit \ If **no** -> we **ignore** it """ commit_id = str(tagged_commit.id) tag_name = tag_name.split("/")[-1] # If we have an annotated tag, get all the information we can out of it if isinstance(tag_object, pygit2.Tag): # in some cases (jspwiki) there are taggers where the name or email contains non utf-8 chars. # In these cases we replace them with the utf-8 replacement character try: name = tag_object.tagger.name except UnicodeDecodeError as e: name = tag_object.tagger.raw_name.decode('utf-8', 'replace') try: email = tag_object.tagger.email except UnicodeDecodeError as e: email = tag_object.tagger.raw_email.decode('utf-8', 'replace') people_model = PeopleModel(name, email) tag_model = TagModel(tag_name, getattr(tag_object, 'message', None), people_model, tag_object.tagger.time, tag_object.tagger.offset) else: tag_model = TagModel(tag_name) # As it can happen that we have commits with tags that are not on any branch (e.g. project Zookeeper), we need # to take care of that here if commit_id in self.commits_to_be_processed: self.commits_to_be_processed[commit_id]['tags'].append(tag_model) else: self.commits_to_be_processed[commit_id] = {'branches': set([]), 'tags': [tag_model]} self.commit_queue.put(commit_id)
def _set_branch_tips(self, branches): """This sets the tips (last commits) for all remote branches. Normally we would also multiprocess here but as this is a quick operation we do not need the additional overhead of defining a new BranchParserProcess. """ self.branches = {} for branch_name in list(self.repository.branches.remote): if str(branch_name).lower().startswith('origin/'): branch = self.repository.branches.remote[branch_name] if branch_name != 'origin/HEAD': # print('head: {}'.format(branch.target.replace('refs/remotes/', '')) self.branches[branch_name] = {'target': str(branch.target), 'is_origin_head': False} # set origin_head we know its there and that it has a target that we also know om = self.repository.branches['origin/HEAD'] om_target = om.target.replace('refs/remotes/', '') self.branches[om_target]['is_origin_head'] = True
[docs] def initialize(self): """ Initializes the parser. It gets all the branch and tag information and puts it into two different locations: First the commit id is put into the commitqueue for the processing with the parsing processes. Second a dictionary is created, which holds the information of which branches a commit is on and which tags it has """ # Get all references (branches, tags) references = set(self.repository.listall_references()) # Get all tags regex = re.compile('^refs/tags') tags = set(filter(lambda r: regex.match(r), self.repository.listall_references())) # Get all branches branches = references - tags # set all tips for every branch self._set_branch_tips(branches) self.logger.info("Getting branch information...") for branch in branches: self.logger.info("Getting information from branch %s" % (branch)) commit = self.repository.lookup_reference(branch).peel() # Walk through every child for child in self.repository.walk(commit.id, pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_TOPOLOGICAL): self.add_branch(child.id, branch) self.logger.info("Getting tags...") # Walk through every tag and put the information in the dictionary via the addtag method for tag in tags: reference = self.repository.lookup_reference(tag) tag_object = self.repository[reference.target.hex] tagged_commit = self.repository.lookup_reference(tag).peel() # we exclude Blobs if isinstance(tagged_commit, pygit2.Blob): continue self.add_tag(tagged_commit, tag, tag_object) # The tagged_commit can have children that are not on any branch, but we may need it anyway --> collect it # and add it only if we have not collected it before try: for child in self.repository.walk(tagged_commit.id, pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_TOPOLOGICAL): if str(child.id) not in self.commits_to_be_processed: self.add_branch(child.id, None) except ValueError as e: # we may hit a tag that does not point to a commit but to a blob, therefore we can not walk over it until libgit implements this # see: 
https://github.com/libgit2/libgit2/issues/3595 if str(e) != 'ValueError: object is not a committish': # we do not bail on this we just ignore tags to blobs raise
[docs] def parse(self, repository_path, datastore, cores_per_job): """ Parses the repository, which is located at the repository_path and save the parsed commits in the datastore, by calling the :func:`pyvcsshark.datastores.basestore.BaseStore.add_commit` method of the chosen datastore. It mostly uses pygit2 (see: http://www.pygit2.org/). The parsing process is divided into several steps: 1. A list of all branches and tags are created 2. All branches and tags are parsed. So we create dictionary of all commits with their corresponding tags\ and branches and add all revision hashes to the commitqueue 3. Add the poison pills for terminating of the parsing process to the commit_queue 4. Create processes of class :class:`pyvcsshark.parser.gitparser.CommitParserProcess`, which parse all\ commits. :param repository_path: Path to the repository :param datastore: Datastore used to save the data to """ self.datastore = datastore self.logger.info("Starting parsing process...") # first we want the branches queue filled for name, val in self.branches.items(): self.datastore.add_branch(BranchTipModel(name, val['target'], val['is_origin_head'])) # Set up the poison pills for i in range(cores_per_job): self.commit_queue.put(None) # Parsing all commits of the queue self.logger.info("Parsing commits...") lock = multiprocessing.Lock() for i in range(cores_per_job): thread = CommitParserProcess(self.commit_queue, self.commits_to_be_processed, self.repository, self.datastore, lock) thread.daemon = True thread.start() self.commit_queue.join() self.logger.info("Parsing complete...") return
class CommitParserProcess(multiprocessing.Process):
    """A worker process (inherits from :class:`multiprocessing.Process`) that parses the
    commits it gets from the queue and calls the
    :func:`pyvcsshark.datastores.basestore.BaseStore.addCommit` function to add the commits.

    :property logger: logger acquired by calling logging.getLogger("parser")

    :param queue: queue, where the different commit hashes are stored in
    :param commits_to_be_processed: dictionary, which contains information about the
        branches and tags of each commit
    :param repository: repository object of type :class:`pygit2.Repository`
    :param datastore: object, that is a subclass of
        :class:`pyvcsshark.datastores.basestore.BaseStore`
    :param lock: lock that is used, so that only one process at a time is calling
        the :func:`pyvcsshark.datastores.basestore.BaseStore.addCommit` function
    """

    def __init__(self, queue, commits_to_be_processed, repository, datastore, lock):
        super().__init__()
        self.queue = queue
        self.commits_to_be_processed = commits_to_be_processed
        self.datastore = datastore
        self.logger = logging.getLogger("parser")
        self.repository = repository
        self.lock = lock
[docs] def run(self): """ The process gets a commit out of the queue and processes it. We use the poisonous pill technique here. Means, our queue has #Processes times "None" in it in the end. If a process encounters that None, he will stop and terminate. """ while True: next_task = self.queue.get() # If process pulls the poisoned pill, he exits if next_task is None: self.queue.task_done() break commitHash = pygit2.Oid(hex=next_task) commit = self.repository[commitHash] self.parse_commit(commit) self.queue.task_done() return
[docs] def parse_commit(self, commit): """ Function for parsing a commit. 1. changedFiles are created (type: list of :class:`pyvcsshark.parser.models.FileModel`) 2. author and commiter are created (type: :class:`pyvcsshark.parser.models.PeopleModel`) 3. parents are added (list of strings) 4. commit model is created (type: :class:`pyvcsshark.parser.models.CommitModel`) 5. :func:`pyvcsshark.datastores.basestore.BaseStore.addCommit` is called :param commit: commit object of type :class:`pygit2.Commit` .. NOTE:: The call to :func:`pyvcsshark.datastores.basestore.BaseStore.addCommit` is thread/process safe, as a\ lock is used to regulate the calls """ # we do not want Blobs (for now) if commit.__class__.__name__ == 'Blob': del self.commits_to_be_processed[str(commit.id)] return # If there are parents, we need to get the normal changed files, if not we need to get the files for initial # commit if commit.parents: changed_files = [] for parent in commit.parents: changed_files += self.get_changed_files_with_similiarity(parent, commit) else: changed_files = self.get_changed_files_for_initial_commit(commit) string_commit_hash = str(commit.id) # Create the different models author_model = PeopleModel(commit.author.name, commit.author.email) committer_model = PeopleModel(commit.committer.name, commit.committer.email) parent_ids = [str(parentId) for parentId in commit.parent_ids] commit_model = CommitModel(string_commit_hash, self.commits_to_be_processed[string_commit_hash]['branches'], self.commits_to_be_processed[string_commit_hash]['tags'], parent_ids, author_model, committer_model, commit.message, changed_files, commit.author.time, commit.author.offset, commit.committer.time, commit.committer.offset) # Make sure, that addCommit is only called by one process at a time self.lock.acquire() self.datastore.add_commit(commit_model) self.lock.release() del self.commits_to_be_processed[string_commit_hash]
[docs] def create_hunks(self, hunks, initial_commit=False): """ Creates the diff in the unified format (see: https://en.wikipedia.org/wiki/Diff#Unified_format) If we have the initial commit, we need to turn around the hunk.* attributes. :param hunks: list of objects of class :class:`pygit2.DiffHunk` :param initial_commit: indicates if we have an initial commit """ list_of_hunks = [] for hunk in hunks: output = "" if initial_commit: for line in hunk.lines: output += "%s%s" % ('+', line.content) gen_hunk = Hunk(hunk.old_start, hunk.old_lines, hunk.new_start, hunk.new_lines, output) else: for line in hunk.lines: output += "%s%s" % (line.origin, line.content) gen_hunk = Hunk(hunk.new_start, hunk.new_lines, hunk.old_start, hunk.old_lines, output) list_of_hunks.append(gen_hunk) return list_of_hunks
[docs] def get_changed_files_for_initial_commit(self, commit): """ Special function for the initial commit, as we need to diff against the empty tree. Creates the changed files list, where objects of class :class:`pyvcsshark.parser.models.FileModel` are added. For every changed file in the initial commit. :param commit: commit of type :class:`pygit2.Commit` """ changed_files = [] diff = commit.tree.diff_to_tree(context_lines=0, interhunk_lines=1) for patch in diff: changed_file = FileModel(patch.delta.old_file.path, patch.delta.old_file.size, patch.line_stats[2], patch.line_stats[1], patch.delta.is_binary, 'A', self.create_hunks(patch.hunks, True)) changed_files.append(changed_file) return changed_files
[docs] def get_changed_files_with_similiarity(self, parent, commit): """ Creates a list of changed files of the class :class:`pyvcsshark.parser.models.FileModel`. For every changed file in the commit such an object is created. Furthermore, hunks are saved an each file is tested for similarity to detect copy and move operations :param parent: Object of class :class:`pygit2.Commit`, that represents the parent commit :param commit: Object of class :class:`pygit2.Commit`, that represents the child commit """ changed_files = [] diff = self.repository.diff(parent, commit, context_lines=0, interhunk_lines=1) opts = pygit2.GIT_DIFF_FIND_RENAMES | pygit2.GIT_DIFF_FIND_COPIES diff.find_similar(opts, GitParser.SIMILARITY_THRESHOLD, GitParser.SIMILARITY_THRESHOLD) already_checked_file_paths = set() for patch in diff: # Only if the filepath was not processed before, add new file if patch.delta.new_file.path in already_checked_file_paths: continue # Check change mode mode = 'X' if patch.delta.status == 1: mode = 'A' elif patch.delta.status == 2: mode = 'D' elif patch.delta.status == 3: mode = 'M' elif patch.delta.status == 4: mode = 'R' elif patch.delta.status == 5: mode = 'C' elif patch.delta.status == 6: mode = 'I' elif patch.delta.status == 7: mode = 'U' elif patch.delta.status == 8: mode = 'T' changed_file = FileModel(patch.delta.new_file.path, patch.delta.new_file.size, patch.line_stats[1], patch.line_stats[2], patch.delta.is_binary, mode, self.create_hunks(patch.hunks), parent_revision_hash=str(parent.id)) # only add oldpath if file was copied/renamed if mode in ['C', 'R']: changed_file.oldPath = patch.delta.old_file.path already_checked_file_paths.add(patch.delta.new_file.path) changed_files.append(changed_file) return changed_files