code update

guosimiao 2025-01-02 11:22:44 +08:00
parent a1c5fbc83b
commit 8b473e65a1
14 changed files with 1218 additions and 20 deletions


@@ -0,0 +1,7 @@
* Serving Flask app 'CloneDetectionAPI'
* Debug mode: off
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
* Running on all addresses (0.0.0.0)
* Running on http://127.0.0.1:5000
* Running on http://10.29.128.27:5000
Press CTRL+C to quit
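
For context, a minimal sketch of a Flask service that would produce the startup log above. The real CloneDetectionAPI routes are not part of this commit, so the endpoint below is hypothetical; only the run() arguments are taken from the log.

from flask import Flask, jsonify

app = Flask("CloneDetectionAPI")

@app.route("/health")  # hypothetical endpoint, not part of the actual service
def health():
    return jsonify({"status": "ok"})

if __name__ == "__main__":
    # host="0.0.0.0" matches "Running on all addresses"; debug is off, as logged
    app.run(host="0.0.0.0", port=5000, debug=False)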


@@ -0,0 +1,304 @@
# aim: The ColdStartPerception service for OSCH
# author: zhangxunhui
# date: 2022-04-23

import hashlib
import os
import queue
import sys
import threading
from typing import List

from ChangedMethodExtractor import ChangedMethodExtractor
from dulwich.objects import Blob, Commit, Tag, Tree
from dulwich.repo import Repo
from dulwich.walk import WalkEntry
from ESUtils import ESUtils
from models.RepoInfo import RepoInfo
from MySQLUtils import MySQLUtils
from services.utils import read_config


class HandleRepository(object):
    def __init__(self, repoInfo: RepoInfo, config: dict, es_utils: ESUtils, type: str):
        self.config = config
        self.repoInfo = repoInfo
        self.repo = Repo(self.repoInfo.repo_path)
        self.type = type
        if self.type == "gitea":
            self.repoInfo.ownername = self.repo.path.split("/")[-2]
            self.repoInfo.reponame = self.repo.path.split("/")[-1].split(".")[0]
            self.mysql_utils = MySQLUtils(
                host=self.config["mysql"]["host"],
                port=self.config["mysql"]["port"],
                username=self.config["mysql"]["username"],
                password=self.config["mysql"]["password"],
                database=self.config["mysql"]["database"],
                autocommit=False,
                dictcursor=True,
            )
            repo_id = self.mysql_utils.get_repo_id(
                self.repoInfo.ownername, self.repoInfo.reponame
            )
            if repo_id is None:
                raise Exception(
                    "HandleRepository Error: cannot find the id of repository: {repository_path}".format(
                        repository_path=self.repoInfo.repo_path  # fixed: self.repository_path was never defined
                    )
                )
            else:
                self.repoInfo.repo_id = repo_id["id"]
        elif self.type == "local":
            self.repoInfo.repo_id = self.hash_path_to_id(self.repoInfo.repo_path)
        self.es_utils = es_utils
        self.handled_commits = self.es_utils.get_handled_commits(
            repo_id=self.repoInfo.repo_id,
            index_name=self.config["elasticsearch"]["index_handled_commits"],
        )

    def hash_path_to_id(self, path):
        """Derive a stable id for a local repository from its path."""
        hash_hex = hashlib.sha256(path.encode()).hexdigest()
        return hash_hex

    def run(self):
        """Get all the commits."""
        print(
            "[Info]: Handling repository {repo_path}".format(
                repo_path=self.repo.path
            )
        )
        commits: List[Commit] = []
        object_store = self.repo.object_store
        object_shas = list(iter(object_store))
        for object_sha in object_shas:
            obj = object_store[object_sha]
            if (
                isinstance(obj, Tag)
                or isinstance(obj, Blob)
                or isinstance(obj, Tree)
            ):
                pass
            elif isinstance(obj, Commit):
                commits.append(obj)
            else:
                raise Exception("HandleRepository.run Error: unknown type!")

        if self.type == "gitea":
            """Whether this repository is forked or original."""
            info = self.mysql_utils.get_repo_info(repo_id=self.repoInfo.repo_id)
            is_fork = False
            if info is not None and info["is_fork"] == 1:
                is_fork = True
            if is_fork:
                # eliminate the forked commits
                fork_id = info["fork_id"]
                origin_info = self.mysql_utils.get_repo_info(repo_id=fork_id)
                if origin_info is not None:
                    origin_ownername = origin_info["owner_name"]
                    origin_reponame = origin_info["name"]
                    origin_repo_path = os.path.join(
                        self.config["gitea"]["repositories_path"],
                        origin_ownername,
                        origin_reponame + ".git",
                    )
                    origin_repo = Repo(origin_repo_path)
                    origin_commits: List[Commit] = []
                    origin_object_store = origin_repo.object_store
                    origin_object_shas = list(iter(origin_object_store))
                    for object_sha in origin_object_shas:
                        obj = origin_object_store[object_sha]
                        if isinstance(obj, Commit):
                            origin_commits.append(obj)
                    commits = list(set(commits) - set(origin_commits))
                else:
                    pass  # origin repo not found in the gitea mysql database
            else:
                pass  # not a fork repo
        else:
            pass  # local repos carry no fork information

        """Handle each commit that has not been handled yet."""
        for commit in commits:
            if commit.id.decode() not in self.handled_commits:
                HandleCommit(
                    repo=self.repo,
                    repoInfo=self.repoInfo,
                    commit=commit,
                    config=self.config,
                    es_utils=self.es_utils,
                ).run()


class HandleRepoThread(threading.Thread):
    def __init__(
        self,
        name: str,
        q: queue.Queue,
        config: dict,
        type: str,
    ):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q
        self.config = config
        self.es_utils = ESUtils(config=self.config)
        self.type = type

    def run(self):
        print("[Info]: Start thread: " + self.name)
        while True:
            try:
                # non-blocking get avoids the race between empty() and get():
                # with a blocking get, two threads can pass the empty() check
                # for a single remaining item and one of them hangs forever
                repoInfo = self.q.get(block=False)
            except queue.Empty:
                break
            # handle one repository (local or gitea)
            handler = HandleRepository(
                repoInfo=repoInfo,
                config=self.config,
                es_utils=self.es_utils,
                type=self.type,
            )
            handler.run()
            self.q.task_done()
        print("[Info]: Exit thread: " + self.name)


class HandleCommit(object):
    def __init__(
        self,
        repo: Repo,
        repoInfo: RepoInfo,
        commit: Commit,
        config: dict,
        es_utils: ESUtils,
    ):
        self.repo = repo
        self.repoInfo = repoInfo
        self.commit = commit
        self.config = config
        self.es_utils = es_utils

    def run(self):
        commit_sha = self.commit.id.decode()
        print(
            "[Info]: Handling commit {commit_sha}".format(
                commit_sha=commit_sha
            )
        )
        """Generate all the changes for this commit."""
        walk_entry = WalkEntry(
            self.repo.get_walker(include=[self.commit.id]), self.commit
        )
        t_changes = walk_entry.changes()  # get all the TreeChange objects
        if len(self.commit.parents) > 1:
            # merge commits yield one change list per parent; flatten them
            t_changes = [item for t_cs in t_changes for item in t_cs]
        changed_methods = ChangedMethodExtractor(
            repo=self.repo,
            repoInfo=self.repoInfo,
            commit=self.commit,
            t_changes=t_changes,
            config=self.config,
        ).parse()
        es_data_bulk = self.es_utils.extract_es_infos(
            changed_methods=changed_methods
        )
        self.es_utils.insert_es_bulk(es_data_bulk)
        """Finish handling this commit, insert into the handled_commits index in es."""
        es_data = {"repo_id": self.repoInfo.repo_id, "commit_sha": commit_sha}
        self.es_utils.insert_es_item(
            item=es_data,
            index_name=self.config["elasticsearch"]["index_handled_commits"],
        )


def handle_repositories(repositories_path: str, type: str, config: dict):
    """Handle all the repositories in the directory."""
    es_utils = ESUtils(config=config)
    es_utils.create_n_gram_index()
    es_utils.create_handled_commit_index()

    """Handle repositories with multiple threads."""
    workQueue = queue.Queue()
    if type == "local":
        repo_git_paths = [f.path for f in os.scandir(repositories_path) if f.is_dir()]
        for repo_git_path in repo_git_paths:
            workQueue.put(RepoInfo(repo_path=repo_git_path))
    elif type == "gitea":
        # iterate all the ownernames
        ownername_paths = [
            f.path for f in os.scandir(repositories_path) if f.is_dir()
        ]
        for ownername_path in ownername_paths:
            # iterate all the repositories
            repo_git_paths = [
                f.path for f in os.scandir(ownername_path) if f.is_dir()
            ]
            for repo_git_path in repo_git_paths:
                # if "test1.git" not in repo_git_path:
                #     continue  # only for test
                workQueue.put(RepoInfo(repo_path=repo_git_path))
    else:
        print("[Error]: illegal type")
        sys.exit(1)

    THREADNUM = config["coldstart_service"]["THREADNUM"]
    threads = []
    for i in range(THREADNUM):
        t = HandleRepoThread(
            name="Thread-" + str(i + 1),
            q=workQueue,
            config=config,
            type=type,
        )
        t.start()
        threads.append(t)
    for t in threads:
        t.join()


def main():
    config_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "config-cpp.yml"
    )
    config = read_config(config_path)
    if config is None:
        print(
            "[Error]: configuration file {config_path} not found".format(
                config_path=config_path
            )
        )
        sys.exit(1)
    # cold start: check local data first
    try:
        local_repositories_path = config["local"]["repositories_path"]
        type = "local"
        handle_repositories(
            repositories_path=local_repositories_path, type=type, config=config
        )
    except KeyError:
        try:
            gitea_repositories_path = config["gitea"]["repositories_path"]
            type = "gitea"
            handle_repositories(
                repositories_path=gitea_repositories_path, type=type, config=config
            )
        except KeyError:
            print("[Error]: local and gitea repositories_path configuration not found")
            sys.exit(1)


if __name__ == "__main__":
    main()
    print("Finish ColdStartPerception service")

@@ -0,0 +1,304 @@
(A second copy of the ColdStartPerception service, identical to the file above except that main() loads "config-java.yml" instead of "config-cpp.yml".)

@@ -0,0 +1,304 @@
(A third copy of the ColdStartPerception service, identical to the first file above except that main() loads "config-python.yml" instead of "config-cpp.yml".)


@@ -0,0 +1,114 @@
Traceback (most recent call last):
File "/home/pdlzxh/OSCH/services/ColdStartPerception.py", line 289, in main
local_repositories_path = config["local"]["repositories_path"]
~~~~~~^^^^^^^^^
KeyError: 'local'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/connection.py", line 174, in _new_conn
conn = connection.create_connection(
(self._dns_host, self.port), self.timeout, **extra_kw
)
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/util/connection.py", line 95, in create_connection
raise err
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/util/connection.py", line 85, in create_connection
sock.connect(sa)
~~~~~~~~~~~~^^^^
ConnectionRefusedError: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/elasticsearch/connection/http_urllib3.py", line 251, in perform_request
response = self.pool.urlopen(
method, url, body, retries=Retry(False), headers=request_headers, **kw
)
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/connectionpool.py", line 799, in urlopen
retries = retries.increment(
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
)
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/util/retry.py", line 525, in increment
raise six.reraise(type(error), error, _stacktrace)
~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/packages/six.py", line 770, in reraise
raise value
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/connectionpool.py", line 715, in urlopen
httplib_response = self._make_request(
conn,
...<5 lines>...
chunked=chunked,
)
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/connectionpool.py", line 416, in _make_request
conn.request(method, url, **httplib_request_kw)
~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/connection.py", line 244, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/http/client.py", line 1336, in request
self._send_request(method, url, body, headers, encode_chunked)
~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/http/client.py", line 1382, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/http/client.py", line 1331, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/http/client.py", line 1091, in _send_output
self.send(msg)
~~~~~~~~~^^^^^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/http/client.py", line 1035, in send
self.connect()
~~~~~~~~~~~~^^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/connection.py", line 205, in connect
conn = self._new_conn()
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/urllib3/connection.py", line 186, in _new_conn
raise NewConnectionError(
self, "Failed to establish a new connection: %s" % e
)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x7f05d79582b0>: Failed to establish a new connection: [Errno 111] Connection refused
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/pdlzxh/OSCH/services/ColdStartPerception.py", line 303, in <module>
main()
~~~~^^
File "/home/pdlzxh/OSCH/services/ColdStartPerception.py", line 296, in main
handle_repositories(repositories_path=gitea_repositories_path, type=type, config=config)
~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/pdlzxh/OSCH/services/ColdStartPerception.py", line 226, in handle_repositories
es_utils = ESUtils(config=config)
File "/home/pdlzxh/OSCH/services/ESUtils.py", line 19, in __init__
self.client = self.connect()
~~~~~~~~~~~~^^
File "/home/pdlzxh/OSCH/services/ESUtils.py", line 25, in connect
response = client.cluster.health()
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/elasticsearch/client/utils.py", line 168, in _wrapped
return func(*args, params=params, headers=headers, **kwargs)
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/elasticsearch/client/cluster.py", line 66, in health
return self.transport.perform_request(
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
"GET",
^^^^^^
...<2 lines>...
headers=headers,
^^^^^^^^^^^^^^^^
)
^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/elasticsearch/transport.py", line 413, in perform_request
raise e
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/elasticsearch/transport.py", line 381, in perform_request
status, headers_response, data = connection.perform_request(
~~~~~~~~~~~~~~~~~~~~~~~~~~^
method,
^^^^^^^
...<5 lines>...
timeout=timeout,
^^^^^^^^^^^^^^^^
)
^
File "/home/pdlzxh/anaconda3/envs/OSCH/lib/python3.13/site-packages/elasticsearch/connection/http_urllib3.py", line 266, in perform_request
raise ConnectionError("N/A", str(e), e)
elasticsearch.exceptions.ConnectionError: ConnectionError(<urllib3.connection.HTTPConnection object at 0x7f05d79582b0>: Failed to establish a new connection: [Errno 111] Connection refused) caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7f05d79582b0>: Failed to establish a new connection: [Errno 111] Connection refused)
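
The root cause in this log is simply that no Elasticsearch instance answered on the configured address. A small pre-flight check along these lines would fail fast with one readable error instead of the nested traceback above; this is a sketch assuming the elasticsearch-py 7.x client that ESUtils uses (its cluster.health() call appears in the traceback).

import sys

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import ConnectionError as ESConnectionError

def ensure_es(urls):
    """Return a client only if the cluster answers a health check."""
    client = Elasticsearch(urls)
    try:
        client.cluster.health()
    except ESConnectionError:
        print("[Error]: cannot reach Elasticsearch at {urls}".format(urls=urls))
        sys.exit(1)
    return client

client = ensure_es(["http://localhost:9200"])  # URL taken from the config files below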


@@ -125,13 +125,16 @@ class HandleRepository(object):
         """Handle each commit."""
         for commit in commits:
-            HandleCommit(
-                repo=self.repo,
-                repoInfo=self.repoInfo,
-                commit=commit,
-                config=self.config,
-                es_utils=self.es_utils,
-            ).run()
+            if commit.id.decode() not in self.handled_commits:
+                HandleCommit(
+                    repo=self.repo,
+                    repoInfo=self.repoInfo,
+                    commit=commit,
+                    config=self.config,
+                    es_utils=self.es_utils,
+                ).run()
+            else:
+                continue
 class HandleRepoThread(threading.Thread):
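
This change makes the service resumable: commits already recorded in the handled_commits index are skipped. get_handled_commits itself is not part of this diff; a sketch of what it plausibly does, given the es_data documents ({"repo_id": ..., "commit_sha": ...}) that HandleCommit.run writes:

from elasticsearch.helpers import scan

def get_handled_commits(client, repo_id, index_name):
    """Collect the commit SHAs already recorded for one repository (sketch)."""
    query = {"query": {"term": {"repo_id": repo_id}}}
    return {
        hit["_source"]["commit_sha"]
        for hit in scan(client, index=index_name, query=query)
    }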


@@ -0,0 +1,7 @@
* Serving Flask app 'IncrementalPerceptionAPI'
* Debug mode: off
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
* Running on all addresses (0.0.0.0)
* Running on http://127.0.0.1:5001
* Running on http://10.29.128.27:5001
Press CTRL+C to quit

services/config-cpp.yml Normal file
@@ -0,0 +1,52 @@
# gitea:
#   repositories_path: "Your project root path on your local machine/dependencies/gitea/git/repositories"
local:
  repositories_path: "/home/common_data/cpp_repo"
  # repositories_path: "/home/pdlzxh/OSCH/test_repo"
elasticsearch:
  urls:
    - "http://localhost:9200"
  username: "elastic"
  password: "ATs_4A7nPIHiK7=pz3T_"
  index_ngram: "handled_n_grams"
  index_handled_commits: "handled_commits"
mysql:
  host: "127.0.0.1"
  port: 3307
  username: "gitea"
  password: "gitea"
  database: "gitea"
rabbitmq:
  host: "127.0.0.1"
  port: 5672
service:
  lang_suffix:
    - "java"
    - "py"
    - "cpp"
  mit: 50
  mil: 6
  ngram: 5
  filter_threshold: 10
  verify_threshold: 70
coldstart_service:
  THREADNUM: 6
incremental_service:
  THREADNUM: 2
nil:
  basepath: "/home/pdlzxh/OSCH/services/parser/nil"
  java: "Func-extractor-java.jar"
  py: "Func-extractor-py.jar"
  cpp: "Func-extractor-cpp.jar"
client_service:
  gitea_url: "Your address for gitea service"
  token: "Your personal account token who installed the webhook client service"

services/config-java.yml Normal file
@@ -0,0 +1,52 @@
(Identical to services/config-cpp.yml above, except repositories_path is "/home/common_data/java_repo" and coldstart_service THREADNUM is 12.)

@@ -0,0 +1,52 @@
(Identical to services/config-cpp.yml above, except repositories_path is "/home/common_data/python_repo".)


@@ -1,14 +1,11 @@
-# gitea:
-#   repositories_path: "Your project root path on your local machine/dependencies/gitea/git/repositories"
-local:
-  repositories_path: "/home/common_data/java_repo"
+gitea:
+  repositories_path: "home/pdlzxh/OSCH/dependencies/gitea/git/repositories"
 elasticsearch:
   urls:
-    - "http://localhost:9200"
-  username: "elastic"
-  password: "ATs_4A7nPIHiK7=pz3T_"
+    - "http://localhost:19200"
+  username: ""
+  password: ""
   index_ngram: "handled_n_grams"
   index_handled_commits: "handled_commits"
@@ -47,5 +44,5 @@ nil:
   cpp: "Func-extractor-cpp.jar"
 client_service:
-  gitea_url: "Your address for gitea service"
-  token: "Your personal account token who installed the webhook client service"
+  gitea_url: "http://10.0.10.27:3000"
+  token: "746865cd309495902cbb6052b788b2043ad03be0"


@@ -2,6 +2,7 @@ import os
 import re
 import shutil
 import subprocess
+from datetime import datetime
 from dulwich.objects import Commit
 from models.MethodInfo import MethodInfo
@@ -43,7 +44,7 @@ class FuncExtractor:
                 self.config["nil"][lang],
             ),
             "-rp",
-            self.repoInfo.repo_path + '/.git',
+            self.repoInfo.repo_path,
             "-os",
             self.object_sha,
             "-mit",
@@ -71,7 +72,8 @@ class FuncExtractor:
                 tokens=tokens,
             )
         except Exception:
-            print("-rp " + self.repoInfo.repo_path + " -os " + self.object_sha + " Finished!")
+            now = datetime.now()
+            print(now.strftime("%Y-%m-%d %H:%M:%S") + " -rp " + self.repoInfo.repo_path + " -os " + self.object_sha + " Finished!")
             return
 def formMethodInfo(self, start_line: int, end_line: int, tokens: list):
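
The -rp change above means the extractor jar now receives the repository root rather than its .git subdirectory. Based only on the flags visible in this hunk (-rp, -os, -mit), the external call is assembled roughly as follows; the full argument list is not shown in the diff, so this is an assumption with placeholder values.

import subprocess

# Placeholder values; "mit" (the minimum token count) comes from the service config.
cmd = [
    "java", "-jar",
    "/home/pdlzxh/OSCH/services/parser/nil/Func-extractor-java.jar",
    "-rp", "/path/to/repo",  # repository path, without the trailing /.git
    "-os", "0123abcd",       # object sha to extract functions from
    "-mit", "50",
]
result = subprocess.run(cmd, capture_output=True, text=True)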


@@ -3,7 +3,7 @@
 restart_service()
 {
     service_name=$1
-    python_path="/home/zxh/anaconda3/envs/OSCH/bin/python" # This should be changed to your local python path
+    python_path="/home/pdlzxh/anaconda3/envs/OSCH/bin/python" # This should be changed to your local python path
     echo "-----------------------------"
     echo "Restarting service: $service_name..."
     script_name="$service_name.py"