////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2014  Oliver Burn
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////
package com.puppycrawl.tools.checkstyle.checks.duplicates;

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.MapMaker;
import com.google.common.collect.Multimap;
import com.puppycrawl.tools.checkstyle.api.AbstractFileSetCheck;
import com.puppycrawl.tools.checkstyle.api.FileText;
import com.puppycrawl.tools.checkstyle.api.MessageDispatcher;
import com.puppycrawl.tools.checkstyle.api.Utils;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Performs a line-by-line comparison of all code lines and reports
 * duplicate code if a sequence of lines differs only in
 * indentation.  All import statements in Java code are ignored, any
 * other line - including javadoc, whitespace lines between methods,
 * etc. - is considered (which is why the check is called
 * <em>strict</em>).
 *
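 * <p>
 * A minimal configuration example (assuming the usual Checkstyle
 * convention of deriving the module name from the class name) might
 * look like this:
 * </p>
 * <pre>
 * &lt;module name="StrictDuplicateCode"&gt;
 *   &lt;property name="min" value="20"/&gt;
 * &lt;/module&gt;
 * </pre>
 *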
 * @author Lars K&uuml;hne
 */
public final class StrictDuplicateCodeCheck extends AbstractFileSetCheck
{
    /**
     * A prime that is used to calculate checksums of lines and blocks
     * of lines. It is important that it is larger than the length of
     * most lines to reduce the likelihood of hash collisions.
     *
     * For a list of primes see
     * http://www.utm.edu/research/primes/lists/small/1000.txt
     */
    private static final int BIG_PRIME = 317;

    /**
     * Converts each consecutive block of {@link #mMin} lines of the
     * original source lines to a checksum that is compared against
     * the checksums of other blocks to find duplicates.
     */
    private interface ChecksumGenerator
    {
        /**
         * Converts each block of the original source lines to a
         * checksum that is compared against the checksums of other
         * blocks to find duplicates.
         *
         * @param aOriginalLines the original lines as they appear
         * in the source file
         *
         * @return an array of (aOriginalLines.length - mMin + 1) checksums
         */
        int[] convertLines(String[] aOriginalLines);
    }


    /**
     * Calculates checksums for text files.
     */
    private class TextfileChecksumGenerator implements ChecksumGenerator
    {
        /** {@inheritDoc} */
        public int[] convertLines(String[] aOriginalLines)
        {
            final int lineCount = aOriginalLines.length;
            final long[] checkSums = new long[lineCount];
            for (int i = 0; i < lineCount; i++) {
                final String line = aOriginalLines[i];
                checkSums[i] = calcChecksum(line);
            }
            final int retLen = Math.max(0, lineCount - mMin + 1);
            final int[] ret = new int[retLen];

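            // Combine each window of mMin consecutive line checksums into a
            // single block checksum. The position-dependent weight (j + 1)
            // makes the sum order-sensitive, so blocks containing the same
            // lines in a different order usually get different checksums.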
            for (int i = 0; i < retLen; i++) {
                int blockChecksum = 0;
                boolean onlyEmptyLines = true;
                for (int j = 0; j < mMin; j++) {
                    if (aOriginalLines[i + j].length() > 0) {
                        onlyEmptyLines = false;
                    }
                    final long checksum = checkSums[i + j];
                    if (checksum == IGNORE) {
                        blockChecksum = IGNORE;
                        break;
                    }
                    blockChecksum += (j + 1) * BIG_PRIME * checksum;
                }
                ret[i] = onlyEmptyLines ? IGNORE : blockChecksum;
            }
            return ret;
        }

        /**
         * Computes a checksum for a single line of text.
         * @param aLine the line of text
         * @return checksum
         */
        protected int calcChecksum(String aLine)
        {
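            // Integer.MIN_VALUE is reserved as the IGNORE marker, so a line
            // whose hash code happens to equal IGNORE is remapped to an
            // arbitrary other value instead of being silently ignored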
            final int hashCode = aLine.hashCode();
            if (hashCode == IGNORE) {
                return Integer.MAX_VALUE / 2;
            }
            return hashCode;
        }
    }

    /**
     * A TextfileChecksumGenerator that also ignores imports.
     */
    private class JavaChecksumGenerator extends TextfileChecksumGenerator
    {
        // TODO: return IGNORE for lines in the header comment?
        // That would require some simple parsing...

        // we could also parse the java code using the TreeWalker
        // and then ignore everything before the CLASS_DEF...

        @Override
        protected int calcChecksum(String aLine)
        {
            // to avoid false alarms it is important that different lines
            // result in different checksums.
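            // lines are pre-trimmed (see getTrimmedLines), so a plain
            // startsWith check also catches indented import statements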
            if (aLine.startsWith("import ")) {
                return IGNORE;
            }
            return super.calcChecksum(aLine);
        }
    }

    /** a jakarta commons log */
    private static final Log LOG =
            LogFactory.getLog(StrictDuplicateCodeCheck.class);

    /** the checksum value to use for lines that should be ignored */
    static final int IGNORE = Integer.MIN_VALUE;

    /** default value for mMin */
    private static final int DEFAULT_MIN_DUPLICATE_LINES = 12;

    /** number of lines that have to be identical for reporting duplicates */
    private int mMin = DEFAULT_MIN_DUPLICATE_LINES;

    /** the basedir to strip off in filenames */
    private String mBasedir;

    /**
     * The checksums of all files that are currently checked.
     * Dimension one: file index
     * Dimension two: block start line
     */
    private int[][] mLineBlockChecksums;

    /**
     * Helper to speed up the search algorithm: holds the checksums from
     * {@link #mLineBlockChecksums}, excluding {@link #IGNORE}, in sorted
     * order.
     */
    private ChecksumInfo[] mChecksumInfo;

    /** files that are currently checked */
    private final List<File> mFiles = Lists.newArrayList();

    /**
     * A SoftReference cache for the trimmed lines of a file path.
     */
    private final Map<String, String[]> mTrimmedLineCache =
        new MapMaker().softValues().makeMap();

    // fields required only for statistics

    /** total number of duplicates found */
    private int mDuplicates;
    /** the charset used to load files. */
    private String mCharset;

    /** Creates a new instance of this class. */
    public StrictDuplicateCodeCheck()
    {
    }

    /**
     * Sets the minimum number of lines that must be equivalent
     * before the check complains.
     *
     * @param aMin the number of lines that must be equal before
     * triggering a 'duplicate code' message.
     */
    public void setMin(int aMin)
    {
        if (aMin < 1) {
            throw new IllegalArgumentException("min must be 1 or higher");
        }
        mMin = aMin;
    }

    /** @param aBasedir the base directory to strip off in filenames */
    public void setBasedir(String aBasedir)
    {
        mBasedir = aBasedir;
    }

    @Override
    public void beginProcessing(String aCharset)
    {
        super.beginProcessing(aCharset);
        mCharset = aCharset;
        mFiles.clear();
    }

    @Override
    protected void processFiltered(File aFile, List<String> aLines)
    {
        mFiles.add(aFile);
    }

    @Override
    public void finishProcessing()
    {
        super.finishProcessing();
        final long start = System.currentTimeMillis();
        mDuplicates = 0;
        mLineBlockChecksums = new int[mFiles.size()][];
        mChecksumInfo = new ChecksumInfo[mFiles.size()];

        if (LOG.isDebugEnabled()) {
            LOG.debug("Reading " + mFiles.size() + " input files");
        }

        for (int i = 0; i < mFiles.size(); i++) {
            final File file = mFiles.get(i);
            try {
                final String[] lines = getTrimmedLines(file);
                final ChecksumGenerator transformer =
                    findChecksumGenerator(file);
                mLineBlockChecksums[i] = transformer.convertLines(lines);
            }
            catch (final IOException ex) {
                LOG.error("Cannot access " + file + " ("
                          + ex.getMessage() + "), ignoring", ex);
                // TODO: better to throw an exception here?
                // it would be best to throw IOException from process(),
                // but interface definition doesn't allow that...
                mLineBlockChecksums = new int[0][0];
            }
        }
        fillSortedRelevantChecksums();

        final long endReading = System.currentTimeMillis();
        findDuplicates();
        final long endSearching = System.currentTimeMillis();

        dumpStats(start, endReading, endSearching);

        mLineBlockChecksums = null;
        mChecksumInfo = null;
    }

    /**
     * Finds the checksum generator for a given file.
     *
     * @param aFile the file to check for duplicates
     * @return a generator to use for aFile
     */
    private ChecksumGenerator findChecksumGenerator(File aFile)
    {
        if (aFile.getName().endsWith(".java")) {
            return new JavaChecksumGenerator();
        }
        // TODO: Not sure what to do with binary files such as gifs
        return new TextfileChecksumGenerator();
    }

    /**
     * Dumps statistics via the debug log.
     * @param aStart start time
     * @param aEndReading end time of reading the physical files
     * @param aEndSearching end time of the duplicate analysis
     */
    private void dumpStats(long aStart, long aEndReading, long aEndSearching)
    {
        if (LOG.isDebugEnabled()) {
            final long initTime = aEndReading - aStart;
            final long workTime = aEndSearching - aEndReading;
            LOG.debug("files = " + mFiles.size());
            LOG.debug("duplicates = " + mDuplicates);
            LOG.debug("Runtime = " + initTime + " + " + workTime);
        }
    }

    /**
     * Filters and sorts the relevant checksums and stores the result
     * in {@link #mChecksumInfo} during the setup phase.
     * That data is later used in a binary search to find out
     * if it is worth investigating a file for duplicates of a block.
     * If the block's checksum does not occur in the other file
     * at all, we can skip that file quickly.
     */
    private void fillSortedRelevantChecksums()
    {
        for (int i = 0; i < mLineBlockChecksums.length; i++) {
            final int[] checksums = mLineBlockChecksums[i];
            mChecksumInfo[i] = new ChecksumInfo(checksums);
        }
    }

    /**
     * Finds duplicate lines in mFiles,
     * using a text search algorithm to find recurring
     * patterns in the line checksums.
     */
    private void findDuplicates()
    {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Analysis phase");
        }

        // It's been a while since my CS degree, but I think this is
        // somewhere near O(LOC^2).

        // It may be possible to do this *much* smarter,
        // but I don't have the Knuth bible at hand right now :-)

        final int len = mFiles.size();
        for (int i = 0; i < len; i++) {

            final String path = mFiles.get(i).getPath();
            getMessageCollector().reset();
            final MessageDispatcher dispatcher = getMessageDispatcher();
            dispatcher.fireFileStarted(path);

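            // compare file i against itself (j == i, to find duplicates
            // within one file) and against all earlier files, so each pair
            // of files is examined exactly once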
            for (int j = 0; j <= i; j++) {
                findDuplicatesInFiles(i, j);
            }

            fireErrors(path);
            dispatcher.fireFileFinished(path);
        }
    }

    /**
     * Compares two files and searches for duplicates.
     * @param aI mLineBlockChecksums index of the first file to compare
     * @param aJ mLineBlockChecksums index of the second file to compare
     */
    private void findDuplicatesInFiles(int aI, int aJ)
    {
        final ChecksumInfo iChecksumInfo = mChecksumInfo[aI];
        final ChecksumInfo jChecksumInfo = mChecksumInfo[aJ];
        if (!iChecksumInfo.hasChecksumOverlapsWith(jChecksumInfo)) {
            return;
        }

        final int[] iLineBlockChecksums = mLineBlockChecksums[aI];
        final int iBlockCount = iLineBlockChecksums.length;

        // blocks of duplicate code might be longer than 'min'. We need to
        // remember the line combinations where we must ignore identical
        // blocks because we have already reported them for an earlier
        // starting line.
        final Multimap<Integer, Integer> ignorePairs =
            ArrayListMultimap.create();

        // go through all the blocks in iFile and
        // check if the following mMin lines occur in jFile
        for (int iLine = 0; iLine < iBlockCount; iLine++) {

            final int iSum = iLineBlockChecksums[iLine];
            final int[] jLines = jChecksumInfo.findLinesWithChecksum(iSum);
            // detailed analysis only if the iLine block occurs in jFile at all
            if (jLines.length > 0) {
                findDuplicateFromLine(aI, aJ, iLine, jLines, ignorePairs);
            }
        }
    }

    /**
     * Finds and reports a duplicate of the code starting from line aILine
     * in file aI in the file aJ. The caller has already ensured that the
     * block starting at aILine has a matching block checksum in aJ, so
     * this method mainly analyzes how far the block of duplicates extends.
     *
     * @param aI index of file that contains the candidate code
     * @param aJ index of file that is searched for a dup of the candidate
     * @param aILine starting line of the candidate in aI
     * @param aJLines start lines in file aJ whose blocks have the same
     * checksum as the block starting at aILine
     * @param aIgnore Multimap from iLine to jLines, an entry indicates that
     * this line i/j-combination has already been reported as part of another
     * violation
     */
    private void findDuplicateFromLine(
        final int aI, final int aJ, final int aILine,
        final int[] aJLines, final Multimap<Integer, Integer> aIgnore)
    {
        // Using something more advanced like Boyer-Moore might be a
        // good idea...

        final int[] iCheckSums = mLineBlockChecksums[aI];
        final int[] jCheckSums = mLineBlockChecksums[aJ];
        final long checkSum = iCheckSums[aILine];

        for (int jLine : aJLines) {

            if (aI == aJ && aILine >= jLine) {
                continue;
            }

            if (jCheckSums[jLine] != checkSum) {
                continue;
            }

            final Collection<Integer> ignoreEntries = aIgnore.get(aILine);
            if (ignoreEntries.contains(jLine)) {
                continue;
            }

            final int duplicateLines =
                verifyDuplicateLines(aI, aJ, aILine, jLine);
            if (duplicateLines >= mMin) {
                reportDuplicate(duplicateLines, aILine, mFiles.get(aJ), jLine);
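                // remember the overlapping sub-blocks of this duplicate so
                // they are not reported again when the outer loop reaches a
                // later start line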
                final int extend = duplicateLines - mMin;
                for (int i = 0; i < extend; i++) {
                    final int offset = (i + 1);
                    aIgnore.put(aILine + offset, jLine + offset);
                }
            }
        }
    }

    /**
     * Verifies the number of lines that are equal.
     * Note that block checksums might be equal for blocks that in fact
     * are different, so we must check the actual file content again.
     *
     * @param aI file index i
     * @param aJ file index j
     * @param aIStartLine start line of potential duplicate code in file i
     * @param aJStartLine start line of potential duplicate code in file j
     * @return the number of verified equal lines
     */
    private int verifyDuplicateLines(
        int aI, int aJ, int aIStartLine, int aJStartLine)
    {
        final File iFile = mFiles.get(aI);
        final File jFile = mFiles.get(aJ);
        try {
            final String[] iLines = getTrimmedLines(iFile);
            final String[] jLines = getTrimmedLines(jFile);

            int verified = 0;
            int i = aIStartLine;
            int j = aJStartLine;
            while (i < iLines.length && j < jLines.length
                && iLines[i++].equals(jLines[j++]))
            {
                verified += 1;
            }
            return verified;
        }
        catch (IOException ex) {
            LOG.error("Unable to verify potential duplicate for "
                + iFile + " and " + jFile, ex);
            return 0;
        }
    }


    /**
     * Returns the trimmed lines of a given file.
     * Caches the results, so when memory is available
     * we try to avoid reading the file repeatedly.
     *
     * @param aFile the file
     * @return the lines in aFile after applying {@link String#trim()}
     * @throws IOException if the file content cannot be read
     */
    private String[] getTrimmedLines(File aFile) throws IOException
    {
        final String path = aFile.getPath();
        final String[] cachedLines = mTrimmedLineCache.get(path);
        if (cachedLines != null) {
            return cachedLines;
        }
        final String charset = mCharset;
        final FileText text = new FileText(aFile, charset);
        final String[] lines = getTrimmed(text.toLinesArray());
        mTrimmedLineCache.put(path, lines);
        return lines;
    }

    /**
     * Applies {@link String#trim()} on each String in a given array.
     * @param aLines the original Strings
     * @return the converted Strings after applying {@link String#trim()}
     */
    private String[] getTrimmed(String[] aLines)
    {
        final String[] ret = new String[aLines.length];
        for (int i = 0; i < ret.length; i++) {
            ret[i] = aLines[i].trim();
        }
        return ret;
    }

    /**
     * Dumps out a duplicate report.
     * @param aEquivalent number of equivalent lines
     * @param aILine location of duplicate code
     * within file that is currently checked
     * @param aJFile the other file that contains the duplicate
     * @param aJLine location of duplicate code within aJFile
     */
    private void reportDuplicate(
            int aEquivalent, int aILine, File aJFile, int aJLine)
    {
        final String fileName =
                Utils.getStrippedFileName(mBasedir, aJFile.getPath());
        log(aILine + 1, "duplicates.lines", aEquivalent, fileName, aJLine + 1);
        mDuplicates += 1;
    }

}