File: removedoubles.cc

package info (click to toggle)
signalbackup-tools 20250313.1-1
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 3,752 kB
  • sloc: cpp: 47,042; sh: 477; ansic: 399; ruby: 19; makefile: 3
file content (207 lines) | stat: -rw-r--r-- 10,854 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
/*
  Copyright (C) 2019-2024  Selwin van Dijk

  This file is part of signalbackup-tools.

  signalbackup-tools is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  signalbackup-tools is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with signalbackup-tools.  If not, see <https://www.gnu.org/licenses/>.
*/

#include "signalbackup.ih"

void SignalBackup::removeDoubles(long long int milliseconds)
{
  Logger::message(__FUNCTION__);

  //auto t1 = std::chrono::high_resolution_clock::now();
  SqliteDB::QueryResults threads;
  if (!d_database.exec("SELECT _id FROM thread ORDER BY _id ASC", &threads))
    return;

  // note: doing this per thread is not needed, but this gives some sense
  // of progress as the query can be quite slow
  long long int removed_total = 0;
  for (unsigned int i = 0; i < threads.rows(); ++i)
  {
    long long int tid = threads.valueAsInt(i, "_id");
    long long int removed_last = 0;

    //SqliteDB::QueryResults todelete;
    if (!d_database.exec("WITH messages_with_attachmentsize AS "
                         "("
                         "  SELECT " + d_mms_table + "._id, " + d_mms_recipient_id + ", thread_id, " + d_mms_date_sent + ", " + d_mms_type + ", body, IFNULL(COUNT(data_size), 0) AS numattachments, IFNULL(SUM(data_size), 0) AS totalfilesize FROM " + d_mms_table +
                         "    LEFT JOIN " + d_part_table + " ON message_id IS " + d_mms_table + "._id"
                         "    WHERE thread_id = ?"
                         "    GROUP BY " + d_mms_table + "._id, " + d_mms_recipient_id + ", thread_id, " + d_mms_date_sent + ", " + d_mms_type + ", body"
                         "), "
                         "candidates AS "
                         "("
                         "  SELECT M._id, M." + d_mms_recipient_id + ", M.thread_id, M." + d_mms_date_sent + ", M." + d_mms_type + ", M.body, M.numattachments, M.totalfilesize FROM"
                         "  ("
                         "    SELECT _id, " + d_mms_recipient_id + ", thread_id, " + d_mms_date_sent + ", " + d_mms_type + ", body, numattachments, totalfilesize FROM messages_with_attachmentsize"
                         "    GROUP BY " + d_mms_recipient_id + ", thread_id, " + d_mms_type + ", COALESCE(body, ''), numattachments, totalfilesize"
                         "    HAVING COUNT(*) > 1"
                         "  ) AS D"
                         "  JOIN messages_with_attachmentsize AS M ON"
                         "    COALESCE(M.body, '') = COALESCE(D.body, '') AND"
                         "    M.numattachments = D.numattachments AND"
                         "    M.totalfilesize = D.totalfilesize AND"
                         "    M." + d_mms_recipient_id + " = D." + d_mms_recipient_id + " AND"
                         "    M." + d_mms_type + " = D." + d_mms_type +
                         "  ORDER BY M._id"
                         "),"
                         "to_delete AS "
                         "("
                         "  SELECT _id FROM candidates C WHERE "
                         "  _id > "
                         "  ("
                         "    SELECT min(_id) FROM candidates WHERE "
                         "      COALESCE(body, '') = COALESCE(C.body, '') AND "
                         "      numattachments = C.numattachments AND"
                         "      totalfilesize = C.totalfilesize AND"
                         "      " + d_mms_recipient_id + " = C." + d_mms_recipient_id + " AND "
                         "      " + d_mms_type + " = C." + d_mms_type + " AND"
                         "      ABS(" + d_mms_date_sent + " - C." + d_mms_date_sent + ") <= ?"
                         "  )"
                         ") "
                         //" SELECT _id FROM to_delete", {tid, milliseconds}, &todelete))
                         "DELETE FROM " + d_mms_table + " WHERE _id IN to_delete", {tid, milliseconds}))
    {
      Logger::error("Failed to delete doubles");
      return;
    }

    removed_last = d_database.changed();
    removed_total += removed_last;
    Logger::message("Deleted ", (removed_last ? Logger::Control::BOLD : Logger::Control::NORMAL), removed_last, Logger::Control::NORMAL, " duplicate entries from thread ", tid, " (", i + 1, "/", threads.rows(), ")");
    //todelete.prettyPrint(false);
  }
  Logger::message("Removed ", (removed_total ? Logger::Control::BOLD : Logger::Control::NORMAL), removed_total, Logger::Control::NORMAL, " doubled messages from database");
  // auto t2 = std::chrono::high_resolution_clock::now();
  // auto ms_int = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
  // std::cout << " *** TIME: " << ms_int.count() << "ms\n";


  //cleanDatabaseByMessages();
  // cleandatabasebymessages is a complicated (risky?) function because it messes with recipients.
  // theoretically, removing doubled messages should not affect the recipient table
  // here we duplicate the relevant parts of cleanDatabaseByMessages()

  // remove dangling parts
  Logger::message("  Deleting attachment entries from '", d_part_table, "' not belonging to remaining mms entries");
  d_database.exec("DELETE FROM " + d_part_table + " WHERE " + d_part_mid + " NOT IN (SELECT DISTINCT _id FROM " + d_mms_table + ")");
  Logger::message("  Removed ", d_database.changed(), " ", d_part_table, "-entries.");

  // remove unused attachments
  Logger::message("  Deleting unused attachments...");
  SqliteDB::QueryResults results;
  d_database.exec("SELECT _id," +
                  (d_database.tableContainsColumn(d_part_table, "unique_id") ? "unique_id"s : "-1 AS unique_id"s) +
                  " FROM " + d_part_table, &results);
  int constexpr INVALID_ID = -10;
  for (auto it = d_attachments.begin(); it != d_attachments.end();)
  {
    bool found = false;
    for (unsigned int i = 0; i < results.rows(); ++i)
    {
      long long int rowid = INVALID_ID;
      if (results.valueHasType<long long int>(i, "_id"))
        rowid = results.getValueAs<long long int>(i, "_id");
      long long int uniqueid = INVALID_ID;
      if (results.valueHasType<long long int>(i, "unique_id"))
        uniqueid = results.getValueAs<long long int>(i, "unique_id");

      if (rowid != INVALID_ID && uniqueid != INVALID_ID &&
          it->first.first == static_cast<uint64_t>(rowid) && it->first.second == static_cast<int64_t>(uniqueid))
      {
        found = true;
        break;
      }
    }
    if (!found)
      it = d_attachments.erase(it);
    else
      ++it;
  }

  // remove unused group_receipts
  Logger::message("  Deleting group receipts entries from deleted messages...");
  d_database.exec("DELETE FROM group_receipts WHERE mms_id NOT IN (SELECT DISTINCT _id FROM " + d_mms_table + ")");
  Logger::message("  Removed ", d_database.changed(), " group_receipts-entries.");

  // remove unused mentions
  if (d_database.containsTable("mention"))
  {
    Logger::message_start("  Deleting entries from 'mention' not belonging to remaining mms entries");
    d_database.exec("DELETE FROM mention WHERE message_id NOT IN (SELECT DISTINCT _id FROM " + d_mms_table + ")");
    Logger::message_end(" (", d_database.changed(), ")");
  }

  // remove unused call details
  if (d_database.containsTable("call"))
  {
    Logger::message_start("  Deleting entries from 'call' not belonging to remaining message entries");
    d_database.exec("DELETE FROM call WHERE message_id NOT IN (SELECT DISTINCT _id FROM " + d_mms_table + ")");
    Logger::message_end(" (", d_database.changed(), ")");
  }

  // remove unreferencing reactions
  if (d_database.containsTable("reaction"))
  {
    if (d_database.containsTable("sms"))
    {
      Logger::message_start("  Deleting entries from 'reaction' not belonging to remaining sms entries");
      d_database.exec("DELETE FROM reaction WHERE is_mms IS NOT 1 AND message_id NOT IN (SELECT DISTINCT _id FROM sms)");
      Logger::message_end(" (", d_database.changed(), ")");
    }
    Logger::message_start("  Deleting entries from 'reaction' not belonging to remaining mms entries");
    d_database.exec("DELETE FROM reaction WHERE message_id NOT IN (SELECT DISTINCT _id FROM " + d_mms_table + ")" +
                    (d_database.tableContainsColumn("reaction", "is_mms") ? " AND is_mms IS 1" : ""));
    Logger::message_end(" (", d_database.changed(), ")");
  }

  // remove msl_message
  if (d_database.containsTable("msl_message"))
  {
    // note this function is generally called because messages (and/or attachments) have been deleted
    // the msl_payload table has triggers that delete its entries:
    // (delete from msl_payload where _id in (select payload_id from message where message_id = (message.deleted_id/part.deletedmid)))
    // apparently these triggers even when editing within this program, even though the 'ON DELETE CASCADE' stuff does not and
    // foreign key constraints are not enforced... This causes a sort of circular thing here but I think we can just clean up the
    // msl_message table according to still-existing msl_payloads first
    d_database.exec("DELETE FROM msl_message WHERE payload_id NOT IN (SELECT DISTINCT _id FROM msl_payload)");

    Logger::message("  Deleting entries from 'msl_message' not belonging to remaining messages");
    if (d_database.containsTable("sms"))
    {
      d_database.exec("DELETE FROM msl_message WHERE is_mms IS NOT 1 AND message_id NOT IN (SELECT DISTINCT _id FROM sms)");
      d_database.exec("DELETE FROM msl_message WHERE is_mms IS 1 AND message_id NOT IN (SELECT DISTINCT _id FROM " + d_mms_table + ")");
    }
    else
      d_database.exec("DELETE FROM msl_message WHERE message_id NOT IN (SELECT DISTINCT _id FROM " + d_mms_table + ")");

    // now delete msl_payloads from non-existing msl_messages ?
    d_database.exec("DELETE FROM msl_payload WHERE _id NOT IN (SELECT DISTINCT payload_id FROM msl_message)");

    // now delete msl_recipients from non existing payloads?
    d_database.exec("DELETE FROM msl_recipient WHERE payload_id NOT IN (SELECT DISTINCT _id FROM msl_payload)");
  }

  if (d_database.containsTable("story_sends"))
  {
    Logger::message_start("  Deleting entries from 'story_sends' not belonging to remaining message entries");
    d_database.exec("DELETE FROM story_sends WHERE message_id NOT IN (SELECT DISTINCT _id FROM " + d_mms_table + ")");
    Logger::message_end(" (", d_database.changed(), ")");
  }

}