summaryrefslogtreecommitdiff
path: root/tools/dup.scm
blob: 217dc8ca5291ed7469c94524f41af3abc45de538 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
;;; dup.scm
;;; (dups size file alloc-lines): 
;;;    find all matches of "size" successive lines in "file" ignoring empty lines and leading/trailing whitespace
;;;    "alloc-lines" is any number bigger than the number of lines in "file"
;;;    (dups 16 "s7.c" 91000) finds all 16-line matches in s7.c which (we wish) has less than 91000 lines in all

;(set! (*s7* 'heap-size) (* 2 1024000))

(define dups 
  (let ((unique #f))

    (define-constant (all-positive? start len)
      (do ((j (+ start len) (- j 1)))
	  ((or (vector-ref unique j)
	       (= j start))
	   j)))

    (lambda (size file alloc-lines)
      (let ((lines (make-vector alloc-lines ""))
	    (original-lines (make-vector alloc-lines ""))
	    (lens (make-int-vector alloc-lines))
	    (linenums (make-int-vector alloc-lines))
	    (size-1 0))
	(set! unique (make-vector alloc-lines #f))
	
	(call-with-input-file file
	  (lambda (p)
	    ;; get lines, original and trimmed
	    (let ((total-lines 
		   (do ((i 0 (+ i 1))
			(j 0)
			(line (read-line p) (read-line p)))
		       ((eq? line #<eof>) j)
		     ;; save original lines
		     (vector-set! original-lines i line)
		     (let ((len (length line)))
		       (when (> len 0)
			 ;; trim leading whitespace
			 (do ((k 0 (+ k 1)))
			     ((or (= k len)
				  (not (char-whitespace? (string-ref line k))))
			      (when (> k 0)
				(set! line (substring line k))
				(set! len (- len k)))))
			 ;; trim trailing whitespace
			 (when (> len 0)
			   (do ((j (- len 1) (- j 1)))
			       ((or (< j 0)
				    (not (char-whitespace? (string-ref line j))))
				(unless (= j (- len 1))
				  (set! line (substring line 0 (+ j 1)))
				  (set! len (+ j 1)))))
			   (when (> len 0)
			     (int-vector-set! linenums j i)
			     (vector-set! lines j line)
			     (int-vector-set! lens j (length line))
			     (set! j (+ j 1)))))))))
	      
	      (set! size (min size total-lines))
	      (set! size-1 (- size 1))
	      ;; (format *stderr* "lines: ~S~%" total-lines)         ; 84201 2-jul-19

	      ;; mark unmatchable strings
	      (let ((sortv (make-vector total-lines)))
		(do ((i 0 (+ i 1)))
		    ((= i total-lines))
		  (vector-set! sortv i (cons (vector-ref lines i) i)))
		(set! sortv (sort! sortv (lambda (a b)
					   (string<? (car a) (car b)))))
		(let ((unctr -1)
		      (matches #f)
		      (current (vector-ref sortv 0)))
		  (for-each (lambda (srt)
			      (if (string=? (car current) (car srt))
				  (set! matches #t)
				  (begin
				    (unless matches
				      (int-vector-set! lens (cdr current) unctr)
				      (vector-set! unique (cdr current) #t) ; unique = (negative? (lens...))
				      (set! unctr (- unctr 1)))
				    (set! matches #f)
				    (set! current srt))))
			    sortv)
		  ;; (format *stderr* "unmatched: ~D~%" (abs unctr)) ; 33796
		  ))

	      ;; look for matches
	      (do ((first #t #t)
		   (last-line (- total-lines size))
		   (i 0 (+ i 1)))
		  ((>= i last-line)) ; >= because i is set below
		(let ((j (all-positive? i size-1)))   ; is a match possible?
		  (if (not (= j i))
		      (set! i j)
		      (let ((lenseq (subvector lens size i))
			    (lineseq (subvector lines size i)))
			(do ((k (+ i 1) (+ k 1)))
			    ((>= k last-line))
			  (let ((jk (all-positive? k size-1)))
			    (if (not (= jk k))
				(set! k jk)
				(when (and (equal? lenseq (subvector lens size k))
					   (equal? lineseq (subvector lines size k)))
				  (let ((full-size size))
				    (do ((nk (+ k size) (+ nk 1))
					 (ni (+ i size) (+ ni 1)))
					((or (= nk total-lines)
					     (not (= (int-vector-ref lens ni) (int-vector-ref lens nk)))
					     (not (string=? (vector-ref lines ni) (vector-ref lines nk))))
					 (set! full-size (- nk k))))
				    (if first
					(let ((first-line (int-vector-ref linenums i)))
					  (format *stderr* "~NC~%~{~A~%~}~%  lines ~D ~D" 8 #\- ; lineseq 
						  (subvector original-lines (- (int-vector-ref linenums (+ i size)) first-line) first-line)
						  first-line
						  (int-vector-ref linenums k))
					  (set! first #f))
					(format *stderr* " ~D" (int-vector-ref linenums k)))
				    (set! i (+ i full-size))
				    (when (< size full-size)
				      (format *stderr* "[~D]" full-size)))))))
			(unless first
			  (format *stderr* "~%")))))))))))))

(dups 16 "s7.c" 100000)
;(dups 8 "s7.c" 100000)
;(dups 12 "ffitest.c" 2000)
;(dups 8 "ffitest.c" 2000)
;(dups 1 "s7test.scm" 105000)

(exit)