Source code for pyvcsshark.parser.gitparser

import sys
import os
import logging
import re
import uuid
import multiprocessing

import pygit2

from pyvcsshark.parser.baseparser import BaseParser
from pyvcsshark.parser.models import BranchModel, PeopleModel, TagModel, FileModel, CommitModel, Hunk, BranchTipModel


class GitParser(BaseParser):
    """Parser for git repositories.

    The general parsing process is described in
    :func:`pyvcsshark.parser.gitparser.GitParser.parse`.

    :property SIMILARITY_THRESHOLD: threshold for deciding if a file is similar to
        another (used for rename/copy detection). Default: 50 (the git default).
    :property repository: object of class :class:`pygit2.Repository`, which represents the repository
    :property commits_to_be_processed: dictionary that is set up the following way:
        commits_to_be_processed = {'<revisionHash>' : {'branches' : set(), 'tags' : []}}, where
        <revisionHash> must be replaced with the actual hash. Therefore, this dictionary holds
        information about every revision and which branches this revision belongs to and which
        tags it has.
    :property logger: logger, which is acquired via logging.getLogger("parser")
    :property datastore: datastore, where the commits should be saved to
    :property commit_queue: object of class :class:`multiprocessing.JoinableQueue`, where commits
        are stored in that can be parsed
    """

    # Includes rename and copy threshold, 50% is the default git threshold
    SIMILARITY_THRESHOLD = 50

    def __init__(self):
        self.repository = None
        self.commits_to_be_processed = {}
        self.logger = logging.getLogger("parser")
        self.datastore = None
        self.commit_queue = multiprocessing.JoinableQueue()

    @property
    def repository_type(self):
        return 'git'
[docs] def get_project_url(self): """ Returns the url of the project, which is processed """ url = "local/"+str(uuid.uuid4()) try: url = self.repository.remotes["origin"].url except KeyError: # repository is only local pass return url
[docs] def finalize(self): """Finalization process for parser""" return
[docs] def detect(self, repository_path): """Try to detect the repository, if its not there an exception is raised and therfore false can be returned""" # new versions of pygit2 just set backend to None if repo does not exist if not os.path.exists(repository_path): return False try: discovered_path = pygit2.discover_repository(repository_path) self.repository = pygit2.Repository(discovered_path) return True except Exception: return False
[docs] def add_branch(self, commit_hash, branch): """ Does two things: First it adds the commitHash to the commitqueue, so that the parsing processes can process this commit. Second it creates objects of type :class:`pyvcsshark.parser.models.BranchModel` and stores it in the dictionary. :param commit_hash: revision hash of the commit to be processed :param branch: branch that should be added for the commit """ string_commit_hash = str(commit_hash) if branch is None: branch_model = None else: branch_model = BranchModel(branch) # If the commit is already in the dict, we only need to append the branch (because then it was already parsed) if string_commit_hash in self.commits_to_be_processed: self.commits_to_be_processed[string_commit_hash]['branches'].add(branch_model) else: self.commit_queue.put(string_commit_hash) self.commits_to_be_processed[string_commit_hash] = {'branches': {branch_model}, 'tags': []}
[docs] def add_tag(self, tagged_commit, tag_name, tag_object): """ Creates objects of type :class:`pyvcsshark.parser.models.TagModel` and stores it in the dictionary mentioned above. :param tagged_commit: revision hash of the commit to be processed :param tag_name: name of the tag that should be added :param tag_object: in git it is possible to annotate tags. If a tag is annotated, we get a tag object of class :class:`pygit2.Tag` .. NOTE:: It can happen, that people committed to a tag and therefore created \ a "tag-branch" which is normally not possible in git. Therefore, we go through all tags and check \ if they respond to a commit, which is already in the dictionary. \ If **yes** -> we **tag** that commit \ If **no** -> we **ignore** it """ commit_id = str(tagged_commit.id) tag_name = tag_name.split("/")[-1] # If we have an annotated tag, get all the information we can out of it if isinstance(tag_object, pygit2.Tag): # in some cases (jspwiki) there are taggers where the name or email contains non utf-8 chars. # In these cases we replace them with the utf-8 replacement character try: name = tag_object.tagger.name except UnicodeDecodeError as e: name = tag_object.tagger.raw_name.decode('utf-8', 'replace') try: email = tag_object.tagger.email except UnicodeDecodeError as e: email = tag_object.tagger.raw_email.decode('utf-8', 'replace') people_model = PeopleModel(name, email) tag_model = TagModel(tag_name, getattr(tag_object, 'message', None), people_model, tag_object.tagger.time, tag_object.tagger.offset) else: tag_model = TagModel(tag_name) # As it can happen that we have commits with tags that are not on any branch (e.g. project Zookeeper), we need # to take care of that here if commit_id in self.commits_to_be_processed: self.commits_to_be_processed[commit_id]['tags'].append(tag_model) else: self.commits_to_be_processed[commit_id] = {'branches': set([]), 'tags': [tag_model]} self.commit_queue.put(commit_id)
def _set_branch_tips(self, branches): """This sets the tips (last commits) for all remote branches. Normally we would also multiprocess here but as this is a quick operation we do not need the additional overhead of defining a new BranchParserProcess. """ self.branches = {} for branch_name in list(self.repository.branches.remote): if str(branch_name).lower().startswith('origin/'): branch = self.repository.branches.remote[branch_name] if branch_name != 'origin/HEAD': # print('head: {}'.format(branch.target.replace('refs/remotes/', '')) self.branches[branch_name] = {'target': str(branch.target), 'is_origin_head': False} # set origin_head we know its there and that it has a target that we also know om = self.repository.branches['origin/HEAD'] om_target = om.target.replace('refs/remotes/', '') self.branches[om_target]['is_origin_head'] = True
[docs] def initialize(self): """ Initializes the parser. It gets all the branch and tag information and puts it into two different locations: First the commit id is put into the commitqueue for the processing with the parsing processes. Second a dictionary is created, which holds the information of which branches a commit is on and which tags it has """ # Get all references (branches, tags) references = set(self.repository.listall_references()) # Get all tags regex = re.compile('^refs/tags') tags = set(filter(lambda r: regex.match(r), self.repository.listall_references())) # Get all branches branches = references - tags # set all tips for every branch self._set_branch_tips(branches) self.logger.info("Getting branch information...") for branch in branches: self.logger.info("Getting information from branch %s" % (branch)) commit = self.repository.lookup_reference(branch).peel() # Walk through every child for child in self.repository.walk(commit.id, pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_TOPOLOGICAL): self.add_branch(child.id, branch) self.logger.info("Getting tags...") # Walk through every tag and put the information in the dictionary via the addtag method for tag in tags: reference = self.repository.lookup_reference(tag) tag_object = self.repository[reference.target.hex] tagged_commit = self.repository.lookup_reference(tag).peel() # we exclude Blobs if isinstance(tagged_commit, pygit2.Blob): continue self.add_tag(tagged_commit, tag, tag_object) # The tagged_commit can have children that are not on any branch, but we may need it anyway --> collect it # and add it only if we have not collected it before try: for child in self.repository.walk(tagged_commit.id, pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_TOPOLOGICAL): if str(child.id) not in self.commits_to_be_processed: self.add_branch(child.id, None) except ValueError as e: # we may hit a tag that does not point to a commit but to a blob, therefore we can not walk over it until libgit implements this # see: 
https://github.com/libgit2/libgit2/issues/3595 if str(e) != 'ValueError: object is not a committish': # we do not bail on this we just ignore tags to blobs raise
[docs] def parse(self, repository_path, datastore, cores_per_job): """ Parses the repository, which is located at the repository_path and save the parsed commits in the datastore, by calling the :func:`pyvcsshark.datastores.basestore.BaseStore.add_commit` method of the chosen datastore. It mostly uses pygit2 (see: http://www.pygit2.org/). The parsing process is divided into several steps: 1. A list of all branches and tags are created 2. All branches and tags are parsed. So we create dictionary of all commits with their corresponding tags\ and branches and add all revision hashes to the commitqueue 3. Add the poison pills for terminating of the parsing process to the commit_queue 4. Create processes of class :class:`pyvcsshark.parser.gitparser.CommitParserProcess`, which parse all\ commits. :param repository_path: Path to the repository :param datastore: Datastore used to save the data to """ self.datastore = datastore self.logger.info("Starting parsing process...") # first we want the branches queue filled for name, val in self.branches.items(): self.datastore.add_branch(BranchTipModel(name, val['target'], val['is_origin_head'])) # Set up the poison pills for i in range(cores_per_job): self.commit_queue.put(None) # Parsing all commits of the queue self.logger.info("Parsing commits...") lock = multiprocessing.Lock() for i in range(cores_per_job): thread = CommitParserProcess(self.commit_queue, self.commits_to_be_processed, self.repository, self.datastore, lock) thread.daemon = True thread.start() self.commit_queue.join() self.logger.info("Parsing complete...") return
class CommitParserProcess(multiprocessing.Process):
    """A worker process (inherits from :class:`multiprocessing.Process`) that parses the
    commits it gets from the queue and calls the
    :func:`pyvcsshark.datastores.basestore.BaseStore.addCommit` function to add the commits.

    :property logger: logger acquired by calling logging.getLogger("parser")

    :param queue: queue, where the different commit hashes are stored in
    :param commits_to_be_processed: dictionary, which contains information about the
        branches and tags of each commit
    :param repository: repository object of type :class:`pygit2.Repository`
    :param datastore: object, that is a subclass of
        :class:`pyvcsshark.datastores.basestore.BaseStore`
    :param lock: lock that is used, so that only one process at a time is calling
        the :func:`pyvcsshark.datastores.basestore.BaseStore.addCommit` function
    """

    def __init__(self, queue, commits_to_be_processed, repository, datastore, lock):
        super().__init__()
        self.queue = queue
        self.commits_to_be_processed = commits_to_be_processed
        self.datastore = datastore
        self.logger = logging.getLogger("parser")
        self.repository = repository
        self.lock = lock
[docs] def run(self): """ The process gets a commit out of the queue and processes it. We use the poisonous pill technique here. Means, our queue has #Processes times "None" in it in the end. If a process encounters that None, he will stop and terminate. """ while True: next_task = self.queue.get() # If process pulls the poisoned pill, he exits if next_task is None: self.queue.task_done() break commitHash = pygit2.Oid(hex=next_task) commit = self.repository[commitHash] self.parse_commit(commit) self.queue.task_done() return
[docs] def parse_commit(self, commit): """ Function for parsing a commit. 1. changedFiles are created (type: list of :class:`pyvcsshark.parser.models.FileModel`) 2. author and commiter are created (type: :class:`pyvcsshark.parser.models.PeopleModel`) 3. parents are added (list of strings) 4. commit model is created (type: :class:`pyvcsshark.parser.models.CommitModel`) 5. :func:`pyvcsshark.datastores.basestore.BaseStore.addCommit` is called :param commit: commit object of type :class:`pygit2.Commit` .. NOTE:: The call to :func:`pyvcsshark.datastores.basestore.BaseStore.addCommit` is thread/process safe, as a\ lock is used to regulate the calls """ # we do not want Blobs (for now) if commit.__class__.__name__ == 'Blob': del self.commits_to_be_processed[str(commit.id)] return # If there are parents, we need to get the normal changed files, if not we need to get the files for initial # commit if commit.parents: changed_files = [] for parent in commit.parents: changed_files += self.get_changed_files_with_similiarity(parent, commit) else: changed_files = self.get_changed_files_for_initial_commit(commit) string_commit_hash = str(commit.id) # Create the different models author_model = PeopleModel(commit.author.name, commit.author.email) committer_model = PeopleModel(commit.committer.name, commit.committer.email) parent_ids = [str(parentId) for parentId in commit.parent_ids] commit_model = CommitModel(string_commit_hash, self.commits_to_be_processed[string_commit_hash]['branches'], self.commits_to_be_processed[string_commit_hash]['tags'], parent_ids, author_model, committer_model, commit.message, changed_files, commit.author.time, commit.author.offset, commit.committer.time, commit.committer.offset) # Make sure, that addCommit is only called by one process at a time self.lock.acquire() self.datastore.add_commit(commit_model) self.lock.release() del self.commits_to_be_processed[string_commit_hash]
[docs] def create_hunks(self, hunks, initial_commit=False): """ Creates the diff in the unified format (see: https://en.wikipedia.org/wiki/Diff#Unified_format) If we have the initial commit, we need to turn around the hunk.* attributes. :param hunks: list of objects of class :class:`pygit2.DiffHunk` :param initial_commit: indicates if we have an initial commit """ list_of_hunks = [] for hunk in hunks: output = "" if initial_commit: for line in hunk.lines: output += "%s%s" % ('+', line.content) gen_hunk = Hunk(hunk.old_start, hunk.old_lines, hunk.new_start, hunk.new_lines, output) else: for line in hunk.lines: output += "%s%s" % (line.origin, line.content) gen_hunk = Hunk(hunk.new_start, hunk.new_lines, hunk.old_start, hunk.old_lines, output) list_of_hunks.append(gen_hunk) return list_of_hunks
[docs] def get_changed_files_for_initial_commit(self, commit): """ Special function for the initial commit, as we need to diff against the empty tree. Creates the changed files list, where objects of class :class:`pyvcsshark.parser.models.FileModel` are added. For every changed file in the initial commit. :param commit: commit of type :class:`pygit2.Commit` """ changed_files = [] diff = commit.tree.diff_to_tree(context_lines=0, interhunk_lines=1) for patch in diff: changed_file = FileModel(patch.delta.old_file.path, patch.delta.old_file.size, patch.line_stats[2], patch.line_stats[1], patch.delta.is_binary, 'A', self.create_hunks(patch.hunks, True)) changed_files.append(changed_file) return changed_files
[docs] def get_changed_files_with_similiarity(self, parent, commit): """ Creates a list of changed files of the class :class:`pyvcsshark.parser.models.FileModel`. For every changed file in the commit such an object is created. Furthermore, hunks are saved an each file is tested for similarity to detect copy and move operations :param parent: Object of class :class:`pygit2.Commit`, that represents the parent commit :param commit: Object of class :class:`pygit2.Commit`, that represents the child commit """ changed_files = [] diff = self.repository.diff(parent, commit, context_lines=0, interhunk_lines=1) opts = pygit2.GIT_DIFF_FIND_RENAMES | pygit2.GIT_DIFF_FIND_COPIES diff.find_similar(opts, GitParser.SIMILARITY_THRESHOLD, GitParser.SIMILARITY_THRESHOLD) already_checked_file_paths = set() for patch in diff: # Only if the filepath was not processed before, add new file if patch.delta.new_file.path in already_checked_file_paths: continue # Check change mode mode = 'X' if patch.delta.status == 1: mode = 'A' elif patch.delta.status == 2: mode = 'D' elif patch.delta.status == 3: mode = 'M' elif patch.delta.status == 4: mode = 'R' elif patch.delta.status == 5: mode = 'C' elif patch.delta.status == 6: mode = 'I' elif patch.delta.status == 7: mode = 'U' elif patch.delta.status == 8: mode = 'T' changed_file = FileModel(patch.delta.new_file.path, patch.delta.new_file.size, patch.line_stats[1], patch.line_stats[2], patch.delta.is_binary, mode, self.create_hunks(patch.hunks), parent_revision_hash=str(parent.id)) # only add oldpath if file was copied/renamed if mode in ['C', 'R']: changed_file.oldPath = patch.delta.old_file.path already_checked_file_paths.add(patch.delta.new_file.path) changed_files.append(changed_file) return changed_files