From 10b896c111845f47ed490b3f014f7e6b832d16da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fays=20J=C3=A9r=C3=A9mie?= <j.fays@uliege.be>
Date: Mon, 27 May 2019 15:19:25 +0200
Subject: [PATCH] Upload New File

---
 Simian-filtered.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 Simian-filtered.py

diff --git a/Simian-filtered.py b/Simian-filtered.py
new file mode 100644
index 0000000..2395b80
--- /dev/null
+++ b/Simian-filtered.py
@@ -0,0 +1,64 @@
+# Script that analyses the XML report produced by Simian (http://www.harukizaemon.com/simian/)
+# in order to detect plagiarism between two projects. It filters the Simian results and removes
+# every set of duplicated blocks that involves only one project.
+#
+# The directory layout must be as follows:
+# 
+#  simianBaseDir
+# 		bin/
+#			simianApp
+#		CODE/
+#			(this script)
+#			projectName/
+#				project1/ (contains source code of project 1)
+#				project2/ (contains source code of project 2)
+#				... (could contain other projects)
+#
+# ----------------------------------------------------------------------
+# Result: writes a file named projectName+'-simian-filtered.xml' to the CODE/ directory.
+# This file contains only code duplicated from one project to another (all duplication within a single project is removed).
+# 
+
+import subprocess
+
+
+simianBaseDir='/simianDir'  # root directory that contains bin/ and CODE/
+simianApp="simian-2.4.0.jar"  # Simian jar file name inside simianBaseDir/bin/
+projectName='projectNameDir'  # directory under CODE/ whose sub-directories are the projects to compare
+callArgument='java -jar '+simianBaseDir+'/bin/'+simianApp+' -formatter=xml:'+projectName+'-simian.xml "'+simianBaseDir+'/CODE/'+projectName+'/**/*.*"'
+# The report path is relative, so run this script from CODE/ (where the report is read back later).
+# call() blocks until Simian exits; Popen() returned at once, before the report had been written.
+retcode = subprocess.call(callArgument, shell=True)
+
+
+def getSubProject(filePath):
+	"""Return the first directory below projectName in filePath, e.g. 'project1' for
+	'.../projectNameDir/project1/src/A.java'. Raises IndexError if projectName is absent."""
+	# Normalise '\' to '/' so reports containing Windows-style paths are handled as well.
+	return filePath.replace('\\', '/').split(projectName, 1)[1].split('/', 2)[1]
+
+
+import xml.etree.ElementTree as ET
+tree = ET.parse(simianBaseDir+"/CODE/"+projectName+'-simian.xml')
+root = tree.getroot()
+# root[0] is the report's first child (presumably Simian's <check> element - confirm);
+# each of its <set> children groups the <block>s reported as copies of one another.
+check = root[0]
+# findall() returns a list, so removing sets from 'check' inside the loop is safe.
+for duplicateSet in check.findall('set'):
+	# Sub-project (first directory below projectName) of every block in this set.
+	subProjects = [getSubProject(block.get('sourceFile')) for block in duplicateSet.iter('block')]
+	# All blocks in one sub-project means the duplication is internal to a single
+	# project, so the set is dropped; an empty set is dropped too instead of crashing.
+	if len(set(subProjects)) <= 1:
+		check.remove(duplicateSet)
+
+tree.write(simianBaseDir+"/CODE/"+projectName+'-simian-filtered.xml')
+
+
+
+
+
+
+
+
-- 
GitLab